//
// Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

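// For illustration only (hypothetical, not a definition used below):
//   reg_def XMM99 ( SOC, SOC, Op_RegF, 99, xmm99->as_VMReg());
// would declare a save-on-call float register whose opcode encoding is 99
// and whose first 32-bit slot is the VMReg returned by as_VMReg(); further
// slots are named with ->next(1), ->next(2), and so on, as in the actual
// definitions that follow.
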
// XMM registers.  512-bit registers or 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperWord flags).
// For pre EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No registers preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM31 preserved across function calls
//              XMM0-XMM3 might hold parameters
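//
// For example (a sketch; these are real product flags, but defaults vary by
// CPU and release), the paths above can be toggled on the command line:
//   java -XX:+UseSSE42Intrinsics -XX:+UseXMMForArrayCopy -XX:+UseSuperWord ...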

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());
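// Note that K0 is deliberately absent: in AVX-512 encodings a mask-field
// value of 0 means "no masking", so k0 cannot serve as a writemask register
// and is not made available to the register allocator.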


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

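// A reg_class_dynamic resolves to one of its two underlying classes when its
// predicate is evaluated: the first class if the %{ ... %} condition holds,
// otherwise the second. Conceptually (a simplified sketch, not ADLC output):
//   float_reg = VM_Version::supports_evex() ? float_reg_evex : float_reg_legacy
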
// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 128bit vector registers (legacy)
  966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
  967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  973                       XMM7,  XMM7b,  XMM7c,  XMM7d
  974 #ifdef _LP64
  975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  977                       XMM10, XMM10b, XMM10c, XMM10d,
  978                       XMM11, XMM11b, XMM11c, XMM11d,
  979                       XMM12, XMM12b, XMM12c, XMM12d,
  980                       XMM13, XMM13b, XMM13c, XMM13d,
  981                       XMM14, XMM14b, XMM14c, XMM14d,
  982                       XMM15, XMM15b, XMM15c, XMM15d
  983 #endif
  984                       );
  985 
// Class for all 128bit vector registers (EVEX)
  987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
  988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
  989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
  990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
  991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
  992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
  993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
  994                       XMM7,  XMM7b,  XMM7c,  XMM7d
  995 #ifdef _LP64
  996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
  997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
  998                       XMM10, XMM10b, XMM10c, XMM10d,
  999                       XMM11, XMM11b, XMM11c, XMM11d,
 1000                       XMM12, XMM12b, XMM12c, XMM12d,
 1001                       XMM13, XMM13b, XMM13c, XMM13d,
 1002                       XMM14, XMM14b, XMM14c, XMM14d,
 1003                       XMM15, XMM15b, XMM15c, XMM15d,
 1004                       XMM16, XMM16b, XMM16c, XMM16d,
 1005                       XMM17, XMM17b, XMM17c, XMM17d,
 1006                       XMM18, XMM18b, XMM18c, XMM18d,
 1007                       XMM19, XMM19b, XMM19c, XMM19d,
 1008                       XMM20, XMM20b, XMM20c, XMM20d,
 1009                       XMM21, XMM21b, XMM21c, XMM21d,
 1010                       XMM22, XMM22b, XMM22c, XMM22d,
 1011                       XMM23, XMM23b, XMM23c, XMM23d,
 1012                       XMM24, XMM24b, XMM24c, XMM24d,
 1013                       XMM25, XMM25b, XMM25c, XMM25d,
 1014                       XMM26, XMM26b, XMM26c, XMM26d,
 1015                       XMM27, XMM27b, XMM27c, XMM27d,
 1016                       XMM28, XMM28b, XMM28c, XMM28d,
 1017                       XMM29, XMM29b, XMM29c, XMM29d,
 1018                       XMM30, XMM30b, XMM30c, XMM30d,
 1019                       XMM31, XMM31b, XMM31c, XMM31d
 1020 #endif
 1021                       );
 1022 
 1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1025 
// Class for all 256bit vector registers (legacy)
 1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1035 #ifdef _LP64
 1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 1044 #endif
 1045                       );
 1046 
// Class for all 256bit vector registers (EVEX)
 1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 1056 #ifdef _LP64
 1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
 1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
 1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
 1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
 1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
 1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
 1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
 1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
 1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
 1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
 1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
 1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
 1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
 1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
 1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
 1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
 1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
 1081 #endif
 1082                       );
 1083 
 1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
 1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 1086 
// Class for all 512bit vector registers (EVEX)
 1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1096 #ifdef _LP64
 1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 1121 #endif
 1122                       );
 1123 
// Class for restricted 512bit vector registers (legacy, XMM0-XMM15 only)
 1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 1133 #ifdef _LP64
 1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 1142 #endif
 1143                       );
 1144 
 1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
 1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 1147 
 1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
 1149 %}
 1150 
 1151 
 1152 //----------SOURCE BLOCK-------------------------------------------------------
 1153 // This is a block of C++ code which provides values, functions, and
 1154 // definitions necessary in the rest of the architecture description
 1155 
 1156 source_hpp %{
 1157 // Header information of the source block.
 1158 // Method declarations/definitions which are used outside
 1159 // the ad-scope can conveniently be defined here.
 1160 //
 1161 // To keep related declarations/definitions/uses close together,
// we switch between source %{ %} and source_hpp %{ %} blocks freely as needed.
 1163 
 1164 #include "runtime/vm_version.hpp"
 1165 
 1166 class NativeJump;
 1167 
 1168 class CallStubImpl {
 1169 
 1170   //--------------------------------------------------------------
 1171   //---<  Used for optimization in Compile::shorten_branches  >---
 1172   //--------------------------------------------------------------
 1173 
 1174  public:
 1175   // Size of call trampoline stub.
 1176   static uint size_call_trampoline() {
 1177     return 0; // no call trampolines on this platform
 1178   }
 1179 
 1180   // number of relocations needed by a call trampoline stub
 1181   static uint reloc_call_trampoline() {
 1182     return 0; // no call trampolines on this platform
 1183   }
 1184 };
 1185 
 1186 class HandlerImpl {
 1187 
 1188  public:
 1189 
 1190   static int emit_exception_handler(CodeBuffer &cbuf);
 1191   static int emit_deopt_handler(CodeBuffer& cbuf);
 1192 
 1193   static uint size_exception_handler() {
 1194     // NativeCall instruction size is the same as NativeJump.
    // exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1197     // Note that this value is also credited (in output.cpp) to
 1198     // the size of the code section.
 1199     return NativeJump::instruction_size;
 1200   }
 1201 
 1202 #ifdef _LP64
 1203   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for the unreachable address.
 1205     return 15+3;
 1206   }
 1207 #else
 1208   static uint size_deopt_handler() {
    // NativeCall instruction size is the same as NativeJump.
    // The handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
 1212     // Note that this value is also credited (in output.cpp) to
 1213     // the size of the code section.
 1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
 1215   }
 1216 #endif
 1217 };
 1218 
 1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
 1220   switch(bytes) {
 1221     case  4: // fall-through
 1222     case  8: // fall-through
 1223     case 16: return Assembler::AVX_128bit;
 1224     case 32: return Assembler::AVX_256bit;
 1225     case 64: return Assembler::AVX_512bit;
 1226 
 1227     default: {
 1228       ShouldNotReachHere();
 1229       return Assembler::AVX_NoVec;
 1230     }
 1231   }
 1232 }
 1233 
 1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
 1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
 1236 }
 1237 
 1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
 1239   uint def_idx = use->operand_index(opnd);
 1240   Node* def = use->in(def_idx);
 1241   return vector_length_encoding(def);
 1242 }
 1243 
 1244 static inline bool is_vector_popcount_predicate(BasicType bt) {
 1245   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1246          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1247 }
 1248 
 1249 static inline bool is_unsigned_booltest_pred(int bt) {
 1250   return  ((bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare);
 1251 }
 1252 
 1253 static inline bool is_clz_non_subword_predicate_evex(BasicType bt, int vlen_bytes) {
 1254   return is_non_subword_integral_type(bt) && VM_Version::supports_avx512cd() &&
 1255            (VM_Version::supports_avx512vl() || vlen_bytes == 64);
 1256 }
 1257 
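// Platform-dependent node flags: extends the ideal Node flag set with an
// x86-specific flag used to tag machnodes affected by the Intel JCC erratum
// (see the erratum handling in the source block below).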
 1258 class Node::PD {
 1259 public:
 1260   enum NodeFlags {
 1261     Flag_intel_jcc_erratum = Node::_last_flag << 1,
 1262     _last_flag             = Flag_intel_jcc_erratum
 1263   };
 1264 };
 1265 
 1266 %} // end source_hpp
 1267 
 1268 source %{
 1269 
 1270 #include "opto/addnode.hpp"
 1271 #include "c2_intelJccErratum_x86.hpp"
 1272 
 1273 void PhaseOutput::pd_perform_mach_node_analysis() {
 1274   if (VM_Version::has_intel_jcc_erratum()) {
 1275     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
 1276     _buf_sizes._code += extra_padding;
 1277   }
 1278 }
 1279 
 1280 int MachNode::pd_alignment_required() const {
 1281   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
 1282     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
 1283     return IntelJccErratum::largest_jcc_size() + 1;
 1284   } else {
 1285     return 1;
 1286   }
 1287 }
 1288 
 1289 int MachNode::compute_padding(int current_offset) const {
 1290   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
 1291     Compile* C = Compile::current();
 1292     PhaseOutput* output = C->output();
 1293     Block* block = output->block();
 1294     int index = output->index();
 1295     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
 1296   } else {
 1297     return 0;
 1298   }
 1299 }
 1300 
 1301 // Emit exception handler code.
 1302 // Stuff framesize into a register and call a VM stub routine.
 1303 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
 1304 
 1305   // Note that the code buffer's insts_mark is always relative to insts.
 1306   // That's why we must use the macroassembler to generate a handler.
 1307   C2_MacroAssembler _masm(&cbuf);
 1308   address base = __ start_a_stub(size_exception_handler());
 1309   if (base == NULL) {
 1310     ciEnv::current()->record_failure("CodeCache is full");
 1311     return 0;  // CodeBuffer::expand failed
 1312   }
 1313   int offset = __ offset();
 1314   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
 1315   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
 1316   __ end_a_stub();
 1317   return offset;
 1318 }
 1319 
 1320 // Emit deopt handler code.
 1321 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
 1322 
 1323   // Note that the code buffer's insts_mark is always relative to insts.
 1324   // That's why we must use the macroassembler to generate a handler.
 1325   C2_MacroAssembler _masm(&cbuf);
 1326   address base = __ start_a_stub(size_deopt_handler());
 1327   if (base == NULL) {
 1328     ciEnv::current()->record_failure("CodeCache is full");
 1329     return 0;  // CodeBuffer::expand failed
 1330   }
 1331   int offset = __ offset();
 1332 
 1333 #ifdef _LP64
 1334   address the_pc = (address) __ pc();
 1335   Label next;
  // push "the_pc" on the stack without destroying any registers,
  // as they all may be live.
 1338 
 1339   // push address of "next"
 1340   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
 1341   __ bind(next);
 1342   // adjust it so it matches "the_pc"
 1343   __ subptr(Address(rsp, 0), __ offset() - offset);
 1344 #else
 1345   InternalAddress here(__ pc());
 1346   __ pushptr(here.addr());
 1347 #endif
 1348 
 1349   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
 1350   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
 1351   __ end_a_stub();
 1352   return offset;
 1353 }
 1354 
 1355 Assembler::Width widthForType(BasicType bt) {
 1356   if (bt == T_BYTE) {
 1357     return Assembler::B;
 1358   } else if (bt == T_SHORT) {
 1359     return Assembler::W;
 1360   } else if (bt == T_INT) {
 1361     return Assembler::D;
 1362   } else {
 1363     assert(bt == T_LONG, "not a long: %s", type2name(bt));
 1364     return Assembler::Q;
 1365   }
 1366 }
 1367 
 1368 //=============================================================================
 1369 
 1370   // Float masks come from different places depending on platform.
 1371 #ifdef _LP64
 1372   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
 1373   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
 1374   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
 1375   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
 1376 #else
 1377   static address float_signmask()  { return (address)float_signmask_pool; }
 1378   static address float_signflip()  { return (address)float_signflip_pool; }
 1379   static address double_signmask() { return (address)double_signmask_pool; }
 1380   static address double_signflip() { return (address)double_signflip_pool; }
 1381 #endif
 1382   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
 1383   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
 1384   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
 1385   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 1386   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 1387   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
 1388   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
 1389   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 1390   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 1391   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 1392   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
 1393   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
 1394   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
 1395   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
 1396   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
 1397 
 1398 //=============================================================================
 1399 const bool Matcher::match_rule_supported(int opcode) {
 1400   if (!has_match_rule(opcode)) {
 1401     return false; // no match rule present
 1402   }
 1403   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1404   switch (opcode) {
 1405     case Op_AbsVL:
 1406     case Op_StoreVectorScatter:
 1407       if (UseAVX < 3) {
 1408         return false;
 1409       }
 1410       break;
 1411     case Op_PopCountI:
 1412     case Op_PopCountL:
 1413       if (!UsePopCountInstruction) {
 1414         return false;
 1415       }
 1416       break;
    case Op_PopCountVI:
    case Op_PopCountVL:
      if (UseAVX < 2) {
        return false;
      }
      break;
 1427     case Op_MulVI:
 1428       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
 1429         return false;
 1430       }
 1431       break;
 1432     case Op_MulVL:
 1433       if (UseSSE < 4) { // only with SSE4_1 or AVX
 1434         return false;
 1435       }
 1436       break;
 1437     case Op_MulReductionVL:
 1438       if (VM_Version::supports_avx512dq() == false) {
 1439         return false;
 1440       }
 1441       break;
 1442     case Op_AddReductionVL:
 1443       if (UseSSE < 2) { // requires at least SSE2
 1444         return false;
 1445       }
 1446       break;
 1447     case Op_AbsVB:
 1448     case Op_AbsVS:
 1449     case Op_AbsVI:
 1450     case Op_AddReductionVI:
 1451     case Op_AndReductionV:
 1452     case Op_OrReductionV:
 1453     case Op_XorReductionV:
 1454       if (UseSSE < 3) { // requires at least SSSE3
 1455         return false;
 1456       }
 1457       break;
 1458     case Op_VectorLoadShuffle:
 1459     case Op_VectorRearrange:
 1460     case Op_MulReductionVI:
 1461       if (UseSSE < 4) { // requires at least SSE4
 1462         return false;
 1463       }
 1464       break;
 1465     case Op_SqrtVD:
 1466     case Op_SqrtVF:
 1467     case Op_VectorMaskCmp:
 1468     case Op_VectorCastB2X:
 1469     case Op_VectorCastS2X:
 1470     case Op_VectorCastI2X:
 1471     case Op_VectorCastL2X:
 1472     case Op_VectorCastF2X:
 1473     case Op_VectorCastD2X:
 1474     case Op_VectorUCastB2X:
 1475     case Op_VectorUCastS2X:
 1476     case Op_VectorUCastI2X:
 1477       if (UseAVX < 1) { // enabled for AVX only
 1478         return false;
 1479       }
 1480       break;
 1481     case Op_RoundVF:
 1482       if (UseAVX < 2) { // enabled for AVX2 only
 1483         return false;
 1484       }
 1485       break;
 1486     case Op_RoundVD:
 1487       if (UseAVX < 3) {
 1488         return false;  // enabled for AVX3 only
 1489       }
 1490       break;
 1491     case Op_CompareAndSwapL:
 1492 #ifdef _LP64
 1493     case Op_CompareAndSwapP:
 1494 #endif
 1495       if (!VM_Version::supports_cx8()) {
 1496         return false;
 1497       }
 1498       break;
 1499     case Op_CMoveVF:
 1500     case Op_CMoveVD:
 1501       if (UseAVX < 1) { // enabled for AVX only
 1502         return false;
 1503       }
 1504       break;
    case Op_StrIndexOf:
    case Op_StrIndexOfChar:
      if (!UseSSE42Intrinsics) {
        return false;
      }
      break;
 1515     case Op_OnSpinWait:
 1516       if (VM_Version::supports_on_spin_wait() == false) {
 1517         return false;
 1518       }
 1519       break;
 1520     case Op_MulVB:
 1521     case Op_LShiftVB:
 1522     case Op_RShiftVB:
 1523     case Op_URShiftVB:
 1524     case Op_VectorInsert:
 1525     case Op_VectorLoadMask:
 1526     case Op_VectorStoreMask:
 1527     case Op_VectorBlend:
 1528       if (UseSSE < 4) {
 1529         return false;
 1530       }
 1531       break;
 1532 #ifdef _LP64
 1533     case Op_MaxD:
 1534     case Op_MaxF:
 1535     case Op_MinD:
 1536     case Op_MinF:
 1537       if (UseAVX < 1) { // enabled for AVX only
 1538         return false;
 1539       }
 1540       break;
 1541 #endif
 1542     case Op_CacheWB:
 1543     case Op_CacheWBPreSync:
 1544     case Op_CacheWBPostSync:
 1545       if (!VM_Version::supports_data_cache_line_flush()) {
 1546         return false;
 1547       }
 1548       break;
 1549     case Op_ExtractB:
 1550     case Op_ExtractL:
 1551     case Op_ExtractI:
 1552     case Op_RoundDoubleMode:
 1553       if (UseSSE < 4) {
 1554         return false;
 1555       }
 1556       break;
 1557     case Op_RoundDoubleModeV:
 1558       if (VM_Version::supports_avx() == false) {
 1559         return false; // 128bit vroundpd is not available
 1560       }
 1561       break;
 1562     case Op_LoadVectorGather:
 1563       if (UseAVX < 2) {
 1564         return false;
 1565       }
 1566       break;
 1567     case Op_FmaVD:
 1568     case Op_FmaVF:
 1569       if (!UseFMA) {
 1570         return false;
 1571       }
 1572       break;
 1573     case Op_MacroLogicV:
 1574       if (UseAVX < 3 || !UseVectorMacroLogic) {
 1575         return false;
 1576       }
 1577       break;
 1578 
 1579     case Op_VectorCmpMasked:
 1580     case Op_VectorMaskGen:
 1581     case Op_LoadVectorMasked:
 1582     case Op_StoreVectorMasked:
      if (!is_LP64 || UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1584         return false;
 1585       }
 1586       break;
 1587     case Op_VectorMaskFirstTrue:
 1588     case Op_VectorMaskLastTrue:
 1589     case Op_VectorMaskTrueCount:
 1590     case Op_VectorMaskToLong:
 1591       if (!is_LP64 || UseAVX < 1) {
 1592          return false;
 1593       }
 1594       break;
 1595     case Op_RoundF:
 1596     case Op_RoundD:
 1597       if (!is_LP64) {
 1598         return false;
 1599       }
 1600       break;
 1601     case Op_CopySignD:
 1602     case Op_CopySignF:
 1603       if (UseAVX < 3 || !is_LP64)  {
 1604         return false;
 1605       }
 1606       if (!VM_Version::supports_avx512vl()) {
 1607         return false;
 1608       }
 1609       break;
 1610 #ifndef _LP64
 1611     case Op_AddReductionVF:
 1612     case Op_AddReductionVD:
 1613     case Op_MulReductionVF:
 1614     case Op_MulReductionVD:
 1615       if (UseSSE < 1) { // requires at least SSE
 1616         return false;
 1617       }
 1618       break;
 1619     case Op_MulAddVS2VI:
 1620     case Op_RShiftVL:
 1621     case Op_AbsVD:
 1622     case Op_NegVD:
 1623       if (UseSSE < 2) {
 1624         return false;
 1625       }
 1626       break;
 1627 #endif // !LP64
 1628     case Op_SignumF:
 1629       if (UseSSE < 1) {
 1630         return false;
 1631       }
 1632       break;
 1633     case Op_SignumD:
 1634       if (UseSSE < 2) {
 1635         return false;
 1636       }
 1637       break;
 1638     case Op_CompressM:
 1639       if (!VM_Version::supports_avx512vl() || !VM_Version::supports_bmi2()) {
 1640         return false;
 1641       }
 1642       break;
 1643     case Op_CompressV:
 1644     case Op_ExpandV:
 1645       if (!VM_Version::supports_avx512vl()) {
 1646         return false;
 1647       }
 1648       break;
 1649     case Op_SqrtF:
 1650       if (UseSSE < 1) {
 1651         return false;
 1652       }
 1653       break;
 1654     case Op_SqrtD:
 1655 #ifdef _LP64
 1656       if (UseSSE < 2) {
 1657         return false;
 1658       }
 1659 #else
 1660       // x86_32.ad has a special match rule for SqrtD.
 1661       // Together with common x86 rules, this handles all UseSSE cases.
 1662 #endif
 1663       break;
 1664   }
 1665   return true;  // Match rules are supported by default.
 1666 }
 1667 
 1668 //------------------------------------------------------------------------
 1669 
 1670 static inline bool is_pop_count_instr_target(BasicType bt) {
 1671   return (is_subword_type(bt) && VM_Version::supports_avx512_bitalg()) ||
 1672          (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq());
 1673 }
 1674 
// Identify extra cases in which we might want to provide match rules for vector nodes
// and other intrinsics guarded with vector length (vlen) and element type (bt).
 1677 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
 1678   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1679   if (!match_rule_supported(opcode)) {
 1680     return false;
 1681   }
 1682   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
 1683   //   * SSE2 supports 128bit vectors for all types;
 1684   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
 1685   //   * AVX2 supports 256bit vectors for all types;
 1686   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
 1687   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
 1688   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
 1689   // And MaxVectorSize is taken into account as well.
 1690   if (!vector_size_supported(bt, vlen)) {
 1691     return false;
 1692   }
 1693   // Special cases which require vector length follow:
 1694   //   * implementation limitations
 1695   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
 1696   //   * 128bit vroundpd instruction is present only in AVX1
 1697   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
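  // e.g. vlen == 8 with bt == T_INT gives 8 * 4 * 8 = 256 bits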
 1698   switch (opcode) {
 1699     case Op_AbsVF:
 1700     case Op_NegVF:
 1701       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
 1702         return false; // 512bit vandps and vxorps are not available
 1703       }
 1704       break;
 1705     case Op_AbsVD:
 1706     case Op_NegVD:
 1707     case Op_MulVL:
 1708       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
 1709         return false; // 512bit vpmullq, vandpd and vxorpd are not available
 1710       }
 1711       break;
 1712     case Op_CMoveVF:
 1713       if (vlen != 8) {
 1714         return false; // implementation limitation (only vcmov8F_reg is present)
 1715       }
 1716       break;
 1717     case Op_RotateRightV:
 1718     case Op_RotateLeftV:
 1719       if (bt != T_INT && bt != T_LONG) {
 1720         return false;
 1721       } // fallthrough
 1722     case Op_MacroLogicV:
 1723       if (!VM_Version::supports_evex() ||
 1724           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
 1725         return false;
 1726       }
 1727       break;
 1728     case Op_ClearArray:
 1729     case Op_VectorMaskGen:
 1730     case Op_VectorCmpMasked:
 1731     case Op_LoadVectorMasked:
 1732     case Op_StoreVectorMasked:
 1733       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
 1734         return false;
 1735       }
 1736       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
 1737         return false;
 1738       }
 1739       break;
 1740     case Op_CMoveVD:
 1741       if (vlen != 4) {
 1742         return false; // implementation limitation (only vcmov4D_reg is present)
 1743       }
 1744       break;
 1745     case Op_MaxV:
 1746     case Op_MinV:
 1747       if (UseSSE < 4 && is_integral_type(bt)) {
 1748         return false;
 1749       }
 1750       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
 1751           // Float/Double intrinsics are enabled for AVX family currently.
 1752           if (UseAVX == 0) {
 1753             return false;
 1754           }
 1755           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
 1756             return false;
 1757           }
 1758       }
 1759       break;
 1760     case Op_CallLeafVector:
 1761       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
 1762         return false;
 1763       }
 1764       break;
 1765     case Op_AddReductionVI:
 1766       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
 1767         return false;
 1768       }
 1769       // fallthrough
 1770     case Op_AndReductionV:
 1771     case Op_OrReductionV:
 1772     case Op_XorReductionV:
 1773       if (is_subword_type(bt) && (UseSSE < 4)) {
 1774         return false;
 1775       }
 1776 #ifndef _LP64
 1777       if (bt == T_BYTE || bt == T_LONG) {
 1778         return false;
 1779       }
 1780 #endif
 1781       break;
 1782 #ifndef _LP64
 1783     case Op_VectorInsert:
 1784       if (bt == T_LONG || bt == T_DOUBLE) {
 1785         return false;
 1786       }
 1787       break;
 1788 #endif
 1789     case Op_MinReductionV:
 1790     case Op_MaxReductionV:
 1791       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
 1792         return false;
 1793       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
 1794         return false;
 1795       }
 1796       // Float/Double intrinsics enabled for AVX family.
 1797       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
 1798         return false;
 1799       }
 1800       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
 1801         return false;
 1802       }
 1803 #ifndef _LP64
 1804       if (bt == T_BYTE || bt == T_LONG) {
 1805         return false;
 1806       }
 1807 #endif
 1808       break;
 1809     case Op_VectorTest:
 1810       if (UseSSE < 4) {
 1811         return false; // Implementation limitation
 1812       } else if (size_in_bits < 32) {
 1813         return false; // Implementation limitation
 1814       } else if (size_in_bits == 512 && (VM_Version::supports_avx512bw() == false)) {
 1815         return false; // Implementation limitation
 1816       }
 1817       break;
 1818     case Op_VectorLoadShuffle:
 1819     case Op_VectorRearrange:
      if (vlen == 2) {
 1821         return false; // Implementation limitation due to how shuffle is loaded
 1822       } else if (size_in_bits == 256 && UseAVX < 2) {
 1823         return false; // Implementation limitation
 1824       } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi())  {
 1825         return false; // Implementation limitation
 1826       } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw())  {
 1827         return false; // Implementation limitation
 1828       }
 1829       break;
 1830     case Op_VectorLoadMask:
 1831       if (size_in_bits == 256 && UseAVX < 2) {
 1832         return false; // Implementation limitation
 1833       }
 1834       // fallthrough
 1835     case Op_VectorStoreMask:
 1836       if (vlen == 2) {
 1837         return false; // Implementation limitation
 1838       }
 1839       break;
 1840     case Op_VectorCastB2X:
 1841     case Op_VectorCastS2X:
 1842     case Op_VectorCastI2X:
 1843       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
 1844         return false;
 1845       }
 1846       break;
 1847     case Op_VectorCastL2X:
 1848       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
 1849         return false;
 1850       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
 1851         return false;
 1852       }
 1853       break;
 1854     case Op_VectorCastD2X:
 1855       if (is_subword_type(bt) || bt == T_INT) {
 1856         return false;
 1857       }
 1858       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
 1859         return false;
 1860       }
 1861       break;
 1862     case Op_RoundVD:
 1863       if (!VM_Version::supports_avx512dq()) {
 1864         return false;
 1865       }
 1866       break;
 1867     case Op_VectorCastF2X:
 1868       if (is_subword_type(bt) || bt == T_LONG) {
 1869         return false;
 1870       }
 1871       break;
 1872     case Op_MulReductionVI:
 1873       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
 1874         return false;
 1875       }
 1876       break;
 1877     case Op_LoadVectorGatherMasked:
 1878     case Op_StoreVectorScatterMasked:
 1879     case Op_StoreVectorScatter:
 1880       if (is_subword_type(bt)) {
 1881         return false;
 1882       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1883         return false;
 1884       }
 1885       // fallthrough
 1886     case Op_LoadVectorGather:
      if (size_in_bits == 64) {
 1888         return false;
 1889       }
 1890       break;
 1891     case Op_MaskAll:
 1892       if (!VM_Version::supports_evex()) {
 1893         return false;
 1894       }
 1895       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
 1896         return false;
 1897       }
 1898       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1899         return false;
 1900       }
 1901       break;
 1902     case Op_VectorMaskCmp:
 1903       if (vlen < 2 || size_in_bits < 32) {
 1904         return false;
 1905       }
 1906       break;
 1907     case Op_CompressM:
 1908       if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
 1909         return false;
 1910       }
 1911       break;
 1912     case Op_CompressV:
 1913     case Op_ExpandV:
 1914       if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
 1915         return false;
 1916       }
      if (size_in_bits < 128) {
 1918         return false;
 1919       }
 1920       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
 1921         return false;
 1922       }
 1923       break;
 1924     case Op_VectorLongToMask:
 1925       if (UseAVX < 1 || !is_LP64) {
 1926         return false;
 1927       }
 1928       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
 1929         return false;
 1930       }
 1931       break;
 1932     case Op_PopCountVI:
 1933     case Op_PopCountVL: {
 1934         if (!is_pop_count_instr_target(bt) &&
 1935             (size_in_bits == 512) && !VM_Version::supports_avx512bw()) {
 1936           return false;
 1937         }
 1938       }
 1939       break;
 1940     case Op_ReverseV:
 1941     case Op_ReverseBytesV:
 1942       if (UseAVX < 2) {
 1943         return false;
 1944       }
 1945       break;
 1946     case Op_CountTrailingZerosV:
 1947     case Op_CountLeadingZerosV:
 1948       if (UseAVX < 2) {
 1949         return false;
 1950       }
 1951       break;
 1952   }
  return true;  // By default, match rules are supported.
 1954 }
 1955 
 1956 const bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a pattern based
  // on the IR opcode. Most of the unary/binary/ternary masked operations share the IR nodes
 1959   // of their non-masked counterpart with mask edge being the differentiator.
 1960   // This routine does a strict check on the existence of masked operation patterns
 1961   // by returning a default false value for all the other opcodes apart from the
 1962   // ones whose masked instruction patterns are defined in this file.
 1963   if (!match_rule_supported_vector(opcode, vlen, bt)) {
 1964     return false;
 1965   }
 1966 
 1967   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
 1968   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
 1969   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
 1970     return false;
 1971   }
 1972   switch(opcode) {
 1973     // Unary masked operations
    case Op_AbsVB:
    case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
        return false;  // Implementation limitation
      }
      // fallthrough
 1979     case Op_AbsVI:
 1980     case Op_AbsVL:
 1981       return true;
 1982 
 1983     // Ternary masked operations
 1984     case Op_FmaVF:
 1985     case Op_FmaVD:
 1986       return true;
 1987 
 1988     case Op_MacroLogicV:
      if (bt != T_INT && bt != T_LONG) {
 1990         return false;
 1991       }
 1992       return true;
 1993 
 1994     // Binary masked operations
 1995     case Op_AddVB:
 1996     case Op_AddVS:
 1997     case Op_SubVB:
 1998     case Op_SubVS:
 1999     case Op_MulVS:
 2000     case Op_LShiftVS:
 2001     case Op_RShiftVS:
 2002     case Op_URShiftVS:
 2003       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2004       if (!VM_Version::supports_avx512bw()) {
 2005         return false;  // Implementation limitation
 2006       }
 2007       return true;
 2008 
 2009     case Op_MulVL:
 2010       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2011       if (!VM_Version::supports_avx512dq()) {
 2012         return false;  // Implementation limitation
 2013       }
 2014       return true;
 2015 
 2016     case Op_AndV:
 2017     case Op_OrV:
 2018     case Op_XorV:
 2019     case Op_RotateRightV:
 2020     case Op_RotateLeftV:
 2021       if (bt != T_INT && bt != T_LONG) {
 2022         return false; // Implementation limitation
 2023       }
 2024       return true;
 2025 
 2026     case Op_VectorLoadMask:
 2027       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
 2028       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2029         return false;
 2030       }
 2031       return true;
 2032 
 2033     case Op_AddVI:
 2034     case Op_AddVL:
 2035     case Op_AddVF:
 2036     case Op_AddVD:
 2037     case Op_SubVI:
 2038     case Op_SubVL:
 2039     case Op_SubVF:
 2040     case Op_SubVD:
 2041     case Op_MulVI:
 2042     case Op_MulVF:
 2043     case Op_MulVD:
 2044     case Op_DivVF:
 2045     case Op_DivVD:
 2046     case Op_SqrtVF:
 2047     case Op_SqrtVD:
 2048     case Op_LShiftVI:
 2049     case Op_LShiftVL:
 2050     case Op_RShiftVI:
 2051     case Op_RShiftVL:
 2052     case Op_URShiftVI:
 2053     case Op_URShiftVL:
 2054     case Op_LoadVectorMasked:
 2055     case Op_StoreVectorMasked:
 2056     case Op_LoadVectorGatherMasked:
 2057     case Op_StoreVectorScatterMasked:
 2058       return true;
 2059 
 2060     case Op_MaxV:
 2061     case Op_MinV:
 2062       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2063         return false; // Implementation limitation
 2064       }
 2065       if (is_floating_point_type(bt)) {
 2066         return false; // Implementation limitation
 2067       }
 2068       return true;
 2069 
 2070     case Op_VectorMaskCmp:
 2071       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
 2072         return false; // Implementation limitation
 2073       }
 2074       return true;
 2075 
 2076     case Op_VectorRearrange:
 2077       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
 2078         return false; // Implementation limitation
 2079       }
 2080       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
 2081         return false; // Implementation limitation
 2082       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
 2083         return false; // Implementation limitation
 2084       }
 2085       return true;
 2086 
 2087     // Binary Logical operations
 2088     case Op_AndVMask:
 2089     case Op_OrVMask:
 2090     case Op_XorVMask:
 2091       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
 2092         return false; // Implementation limitation
 2093       }
 2094       return true;
 2095 
 2096     case Op_PopCountVI:
 2097     case Op_PopCountVL:
 2098       if (!is_pop_count_instr_target(bt)) {
 2099         return false;
 2100       }
 2101       return true;
 2102 
 2103     case Op_MaskAll:
 2104       return true;
 2105 
 2106     case Op_CountLeadingZerosV:
 2107       if ((bt == T_INT || bt == T_LONG) && VM_Version::supports_avx512cd()) {
 2108         return true;
      }
      // fallthrough
 2110     default:
 2111       return false;
 2112   }
 2113 }
 2114 
 2115 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
 2116   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
 2117   bool legacy = (generic_opnd->opcode() == LEGVEC);
 2118   if (!VM_Version::supports_avx512vlbwdq() && // KNL
 2119       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
 2120     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
 2121     return new legVecZOper();
 2122   }
 2123   if (legacy) {
 2124     switch (ideal_reg) {
 2125       case Op_VecS: return new legVecSOper();
 2126       case Op_VecD: return new legVecDOper();
 2127       case Op_VecX: return new legVecXOper();
 2128       case Op_VecY: return new legVecYOper();
 2129       case Op_VecZ: return new legVecZOper();
 2130     }
 2131   } else {
 2132     switch (ideal_reg) {
 2133       case Op_VecS: return new vecSOper();
 2134       case Op_VecD: return new vecDOper();
 2135       case Op_VecX: return new vecXOper();
 2136       case Op_VecY: return new vecYOper();
 2137       case Op_VecZ: return new vecZOper();
 2138     }
 2139   }
 2140   ShouldNotReachHere();
 2141   return NULL;
 2142 }
 2143 
 2144 bool Matcher::is_reg2reg_move(MachNode* m) {
 2145   switch (m->rule()) {
 2146     case MoveVec2Leg_rule:
 2147     case MoveLeg2Vec_rule:
 2148     case MoveF2VL_rule:
 2149     case MoveF2LEG_rule:
 2150     case MoveVL2F_rule:
 2151     case MoveLEG2F_rule:
 2152     case MoveD2VL_rule:
 2153     case MoveD2LEG_rule:
 2154     case MoveVL2D_rule:
 2155     case MoveLEG2D_rule:
 2156       return true;
 2157     default:
 2158       return false;
 2159   }
 2160 }
 2161 
 2162 bool Matcher::is_generic_vector(MachOper* opnd) {
 2163   switch (opnd->opcode()) {
 2164     case VEC:
 2165     case LEGVEC:
 2166       return true;
 2167     default:
 2168       return false;
 2169   }
 2170 }
 2171 
 2172 //------------------------------------------------------------------------
 2173 
 2174 const RegMask* Matcher::predicate_reg_mask(void) {
 2175   return &_VECTMASK_REG_mask;
 2176 }
 2177 
 2178 const TypeVectMask* Matcher::predicate_reg_type(const Type* elemTy, int length) {
 2179   return new TypeVectMask(elemTy, length);
 2180 }
 2181 
 2182 // Max vector size in bytes. 0 if not supported.
 2183 const int Matcher::vector_width_in_bytes(BasicType bt) {
 2184   assert(is_java_primitive(bt), "only primitive type vectors");
 2185   if (UseSSE < 2) return 0;
 2186   // SSE2 supports 128bit vectors for all types.
 2187   // AVX2 supports 256bit vectors for all types.
  // AVX512 (EVEX) supports 512bit vectors for all types.
 2189   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
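  // e.g. UseAVX == 2 -> (1 << 2) * 8 = 32 bytes (256bit); UseAVX == 3 -> 64 bytes (512bit).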
 2190   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
 2191   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
 2192     size = (UseAVX > 2) ? 64 : 32;
 2193   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
 2194     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
 2195   // Use flag to limit vector size.
 2196   size = MIN2(size,(int)MaxVectorSize);
 2197   // Minimum 2 values in vector (or 4 for bytes).
 2198   switch (bt) {
 2199   case T_DOUBLE:
 2200   case T_LONG:
 2201     if (size < 16) return 0;
 2202     break;
 2203   case T_FLOAT:
 2204   case T_INT:
 2205     if (size < 8) return 0;
 2206     break;
  case T_BOOLEAN:
  case T_CHAR:
  case T_BYTE:
  case T_SHORT:
    if (size < 4) return 0;
    break;
 2219   default:
 2220     ShouldNotReachHere();
 2221   }
 2222   return size;
 2223 }
 2224 
 2225 // Limits on vector size (number of elements) loaded into vector.
 2226 const int Matcher::max_vector_size(const BasicType bt) {
 2227   return vector_width_in_bytes(bt)/type2aelembytes(bt);
 2228 }
 2229 const int Matcher::min_vector_size(const BasicType bt) {
 2230   int max_size = max_vector_size(bt);
  // The minimum size that can be loaded into a vector is 4 bytes.
 2232   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
  // Allow single-element (double64) vectors to support SVML vector calls.
 2234   if (bt == T_DOUBLE) {
 2235     size = 1;
 2236   }
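  // e.g. T_BYTE -> min 4 elements, T_INT -> min 2, T_DOUBLE -> min 1.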
 2237   return MIN2(size,max_size);
 2238 }
 2239 
 2240 const int Matcher::scalable_vector_reg_size(const BasicType bt) {
 2241   return -1;
 2242 }
 2243 
 2244 // Vector ideal reg corresponding to specified size in bytes
 2245 const uint Matcher::vector_ideal_reg(int size) {
 2246   assert(MaxVectorSize >= size, "");
 2247   switch(size) {
 2248     case  4: return Op_VecS;
 2249     case  8: return Op_VecD;
 2250     case 16: return Op_VecX;
 2251     case 32: return Op_VecY;
 2252     case 64: return Op_VecZ;
 2253   }
 2254   ShouldNotReachHere();
 2255   return 0;
 2256 }
 2257 
 2258 // Check for shift by small constant as well
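// (x86 addressing modes can absorb an index scaled by 1, 2, 4 or 8, i.e. a
// left shift by at most 3, so such shifts are folded into the address
// expression rather than matched as separate instructions.)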
 2259 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
 2260   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
 2261       shift->in(2)->get_int() <= 3 &&
 2262       // Are there other uses besides address expressions?
 2263       !matcher->is_visited(shift)) {
 2264     address_visited.set(shift->_idx); // Flag as address_visited
 2265     mstack.push(shift->in(2), Matcher::Visit);
 2266     Node *conv = shift->in(1);
 2267 #ifdef _LP64
    // Allow the Matcher to match rules that bypass the ConvI2L
    // operation for an array index on LP64 when the index value
    // is known to be non-negative.
 2271     if (conv->Opcode() == Op_ConvI2L &&
 2272         conv->as_Type()->type()->is_long()->_lo >= 0 &&
 2273         // Are there other uses besides address expressions?
 2274         !matcher->is_visited(conv)) {
 2275       address_visited.set(conv->_idx); // Flag as address_visited
 2276       mstack.push(conv->in(1), Matcher::Pre_Visit);
 2277     } else
 2278 #endif
 2279       mstack.push(conv, Matcher::Pre_Visit);
 2280     return true;
 2281   }
 2282   return false;
 2283 }
 2284 
// This function identifies sub-graphs in which a 'load' node is
// an input to two different nodes, such that the graph can be matched
// with BMI instructions like blsi, blsr, etc.
// Example: b = -a[i] & a[i] can be matched to 'blsi r32, m32'.
// The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
// refers to the same node.
 2291 //
 2292 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
 2293 // This is a temporary solution until we make DAGs expressible in ADL.
 2294 template<typename ConType>
 2295 class FusedPatternMatcher {
 2296   Node* _op1_node;
 2297   Node* _mop_node;
 2298   int _con_op;
 2299 
 2300   static int match_next(Node* n, int next_op, int next_op_idx) {
 2301     if (n->in(1) == NULL || n->in(2) == NULL) {
 2302       return -1;
 2303     }
 2304 
 2305     if (next_op_idx == -1) { // n is commutative, try rotations
 2306       if (n->in(1)->Opcode() == next_op) {
 2307         return 1;
 2308       } else if (n->in(2)->Opcode() == next_op) {
 2309         return 2;
 2310       }
 2311     } else {
 2312       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
 2313       if (n->in(next_op_idx)->Opcode() == next_op) {
 2314         return next_op_idx;
 2315       }
 2316     }
 2317     return -1;
 2318   }
 2319 
 2320  public:
 2321   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
 2322     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
 2323 
 2324   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
 2325              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
 2326              typename ConType::NativeType con_value) {
 2327     if (_op1_node->Opcode() != op1) {
 2328       return false;
 2329     }
 2330     if (_mop_node->outcnt() > 2) {
 2331       return false;
 2332     }
 2333     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
 2334     if (op1_op2_idx == -1) {
 2335       return false;
 2336     }
 2337     // Memory operation must be the other edge
 2338     int op1_mop_idx = (op1_op2_idx & 1) + 1;
 2339 
 2340     // Check that the mop node is really what we want
 2341     if (_op1_node->in(op1_mop_idx) == _mop_node) {
 2342       Node* op2_node = _op1_node->in(op1_op2_idx);
 2343       if (op2_node->outcnt() > 1) {
 2344         return false;
 2345       }
 2346       assert(op2_node->Opcode() == op2, "Should be");
 2347       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
 2348       if (op2_con_idx == -1) {
 2349         return false;
 2350       }
 2351       // Memory operation must be the other edge
 2352       int op2_mop_idx = (op2_con_idx & 1) + 1;
 2353       // Check that the memory operation is the same node
 2354       if (op2_node->in(op2_mop_idx) == _mop_node) {
 2355         // Now check the constant
 2356         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
 2357         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
 2358           return true;
 2359         }
 2360       }
 2361     }
 2362     return false;
 2363   }
 2364 };
 2365 
 2366 static bool is_bmi_pattern(Node* n, Node* m) {
 2367   assert(UseBMI1Instructions, "sanity");
 2368   if (n != NULL && m != NULL) {
 2369     if (m->Opcode() == Op_LoadI) {
 2370       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
 2371       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
 2372              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
 2373              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
 2374     } else if (m->Opcode() == Op_LoadL) {
 2375       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
 2376       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
 2377              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
 2378              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
 2379     }
 2380   }
 2381   return false;
 2382 }
 2383 
 2384 // Should the matcher clone input 'm' of node 'n'?
 2385 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
 2386   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
 2387   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
 2388     mstack.push(m, Visit);
 2389     return true;
 2390   }
 2391   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
 2392     mstack.push(m, Visit);           // m = ShiftCntV
 2393     return true;
 2394   }
 2395   return false;
 2396 }
 2397 
 2398 // Should the Matcher clone shifts on addressing modes, expecting them
 2399 // to be subsumed into complex addressing expressions or compute them
 2400 // into registers?
 2401 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
 2402   Node *off = m->in(AddPNode::Offset);
 2403   if (off->is_Con()) {
 2404     address_visited.test_set(m->_idx); // Flag as address_visited
 2405     Node *adr = m->in(AddPNode::Address);
 2406 
    // Intel can handle 2 adds in an addressing mode.
    // An AtomicAdd is not an addressing expression;
    // it is cheap to find by looking for a screwy base.
 2410     if (adr->is_AddP() &&
 2411         !adr->in(AddPNode::Base)->is_top() &&
 2412         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
 2413         // Are there other uses besides address expressions?
 2414         !is_visited(adr)) {
 2415       address_visited.set(adr->_idx); // Flag as address_visited
 2416       Node *shift = adr->in(AddPNode::Offset);
 2417       if (!clone_shift(shift, this, mstack, address_visited)) {
 2418         mstack.push(shift, Pre_Visit);
 2419       }
 2420       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
 2421       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
 2422     } else {
 2423       mstack.push(adr, Pre_Visit);
 2424     }
 2425 
 2426     // Clone X+offset as it also folds into most addressing expressions
 2427     mstack.push(off, Visit);
 2428     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2429     return true;
 2430   } else if (clone_shift(off, this, mstack, address_visited)) {
 2431     address_visited.test_set(m->_idx); // Flag as address_visited
 2432     mstack.push(m->in(AddPNode::Address), Pre_Visit);
 2433     mstack.push(m->in(AddPNode::Base), Pre_Visit);
 2434     return true;
 2435   }
 2436   return false;
 2437 }
 2438 
 2439 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
 2440   switch (bt) {
 2441     case BoolTest::eq:
 2442       return Assembler::eq;
 2443     case BoolTest::ne:
 2444       return Assembler::neq;
 2445     case BoolTest::le:
 2446     case BoolTest::ule:
 2447       return Assembler::le;
 2448     case BoolTest::ge:
 2449     case BoolTest::uge:
 2450       return Assembler::nlt;
 2451     case BoolTest::lt:
 2452     case BoolTest::ult:
 2453       return Assembler::lt;
 2454     case BoolTest::gt:
 2455     case BoolTest::ugt:
 2456       return Assembler::nle;
 2457     default : ShouldNotReachHere(); return Assembler::_false;
 2458   }
 2459 }
 2460 
 2461 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
 2462   switch (bt) {
 2463   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
 2464   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
 2465   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
 2466   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
 2467   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
 2468   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
 2469   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
 2470   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
 2471   }
 2472 }
 2473 
 2474 // Helper methods for MachSpillCopyNode::implementation().
 2475 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
 2476                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  assert(ireg == Op_VecS || // 32bit vector
         ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
         "no non-adjacent vector moves" );
 2481   if (cbuf) {
 2482     C2_MacroAssembler _masm(cbuf);
 2483     switch (ireg) {
 2484     case Op_VecS: // copy whole register
 2485     case Op_VecD:
 2486     case Op_VecX:
 2487 #ifndef _LP64
 2488       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2489 #else
 2490       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2491         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2492       } else {
 2493         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2494      }
 2495 #endif
 2496       break;
 2497     case Op_VecY:
 2498 #ifndef _LP64
 2499       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2500 #else
 2501       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2502         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
 2503       } else {
 2504         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
 2505      }
 2506 #endif
 2507       break;
 2508     case Op_VecZ:
 2509       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
 2510       break;
 2511     default:
 2512       ShouldNotReachHere();
 2513     }
 2514 #ifndef PRODUCT
 2515   } else {
 2516     switch (ireg) {
 2517     case Op_VecS:
 2518     case Op_VecD:
 2519     case Op_VecX:
 2520       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2521       break;
 2522     case Op_VecY:
 2523     case Op_VecZ:
 2524       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
 2525       break;
 2526     default:
 2527       ShouldNotReachHere();
 2528     }
 2529 #endif
 2530   }
 2531 }
 2532 
 2533 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
 2534                      int stack_offset, int reg, uint ireg, outputStream* st) {
 2535   if (cbuf) {
 2536     C2_MacroAssembler _masm(cbuf);
 2537     if (is_load) {
 2538       switch (ireg) {
 2539       case Op_VecS:
 2540         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2541         break;
 2542       case Op_VecD:
 2543         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2544         break;
 2545       case Op_VecX:
 2546 #ifndef _LP64
 2547         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2548 #else
 2549         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2550           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2551         } else {
 2552           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2554         }
 2555 #endif
 2556         break;
 2557       case Op_VecY:
 2558 #ifndef _LP64
 2559         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2560 #else
 2561         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2562           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
 2563         } else {
 2564           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
 2566         }
 2567 #endif
 2568         break;
 2569       case Op_VecZ:
 2570         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
 2571         break;
 2572       default:
 2573         ShouldNotReachHere();
 2574       }
 2575     } else { // store
 2576       switch (ireg) {
 2577       case Op_VecS:
 2578         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2579         break;
 2580       case Op_VecD:
 2581         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2582         break;
 2583       case Op_VecX:
 2584 #ifndef _LP64
 2585         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2586 #else
 2587         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2588           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2589         }
 2590         else {
 2591           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2592         }
 2593 #endif
 2594         break;
 2595       case Op_VecY:
 2596 #ifndef _LP64
 2597         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2598 #else
 2599         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
 2600           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
 2601         }
 2602         else {
 2603           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
 2604         }
 2605 #endif
 2606         break;
 2607       case Op_VecZ:
 2608         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
 2609         break;
 2610       default:
 2611         ShouldNotReachHere();
 2612       }
 2613     }
 2614 #ifndef PRODUCT
 2615   } else {
 2616     if (is_load) {
 2617       switch (ireg) {
 2618       case Op_VecS:
 2619         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2620         break;
 2621       case Op_VecD:
 2622         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2623         break;
 2624        case Op_VecX:
 2625         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2626         break;
 2627       case Op_VecY:
 2628       case Op_VecZ:
 2629         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
 2630         break;
 2631       default:
 2632         ShouldNotReachHere();
 2633       }
 2634     } else { // store
 2635       switch (ireg) {
 2636       case Op_VecS:
 2637         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2638         break;
 2639       case Op_VecD:
 2640         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2641         break;
 2642        case Op_VecX:
 2643         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2644         break;
 2645       case Op_VecY:
 2646       case Op_VecZ:
 2647         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
 2648         break;
 2649       default:
 2650         ShouldNotReachHere();
 2651       }
 2652     }
 2653 #endif
 2654   }
 2655 }
 2656 
 2657 template <class T>
 2658 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
 2659   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
 2660   jvalue ele;
 2661   switch (bt) {
 2662     case T_BYTE:   ele.b = con; break;
 2663     case T_SHORT:  ele.s = con; break;
 2664     case T_INT:    ele.i = con; break;
 2665     case T_LONG:   ele.j = con; break;
 2666     case T_FLOAT:  ele.f = con; break;
 2667     case T_DOUBLE: ele.d = con; break;
 2668     default: ShouldNotReachHere();
 2669   }
 2670   for (int i = 0; i < len; i++) {
 2671     val->append(ele);
 2672   }
 2673   return val;
 2674 }
 2675 
 2676 static inline jlong high_bit_set(BasicType bt) {
 2677   switch (bt) {
 2678     case T_BYTE:  return 0x8080808080808080;
 2679     case T_SHORT: return 0x8000800080008000;
 2680     case T_INT:   return 0x8000000080000000;
 2681     case T_LONG:  return 0x8000000000000000;
 2682     default:
 2683       ShouldNotReachHere();
 2684       return 0;
 2685   }
 2686 }
 2687 
 2688 #ifndef PRODUCT
 2689   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
 2690     st->print("nop \t# %d bytes pad for loops and calls", _count);
 2691   }
 2692 #endif
 2693 
 2694   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
 2695     C2_MacroAssembler _masm(&cbuf);
 2696     __ nop(_count);
 2697   }
 2698 
 2699   uint MachNopNode::size(PhaseRegAlloc*) const {
 2700     return _count;
 2701   }
 2702 
 2703 #ifndef PRODUCT
 2704   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
 2705     st->print("# breakpoint");
 2706   }
 2707 #endif
 2708 
 2709   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
 2710     C2_MacroAssembler _masm(&cbuf);
 2711     __ int3();
 2712   }
 2713 
 2714   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
 2715     return MachNode::size(ra_);
 2716   }
 2717 
 2718 %}
 2719 
 2720 encode %{
 2721 
 2722   enc_class call_epilog %{
 2723     if (VerifyStackAtCalls) {
 2724       // Check that stack depth is unchanged: find majik cookie on stack
 2725       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
 2726       C2_MacroAssembler _masm(&cbuf);
 2727       Label L;
 2728       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
 2729       __ jccb(Assembler::equal, L);
 2730       // Die if stack mismatch
 2731       __ int3();
 2732       __ bind(L);
 2733     }
 2734   %}
 2735 
 2736 %}
 2737 
// Operands for bound floating-point register arguments
 2739 operand rxmm0() %{
 2740   constraint(ALLOC_IN_RC(xmm0_reg));
 2741   match(VecX);
  format %{ %}
 2743   interface(REG_INTER);
 2744 %}
 2745 
 2746 //----------OPERANDS-----------------------------------------------------------
 2747 // Operand definitions must precede instruction definitions for correct parsing
 2748 // in the ADLC because operands constitute user defined types which are used in
 2749 // instruction definitions.
 2750 
 2751 // Vectors
 2752 
 2753 // Dummy generic vector class. Should be used for all vector operands.
// Replaced with vec[SDXYZ] during post-selection cleanup.
 2755 operand vec() %{
 2756   constraint(ALLOC_IN_RC(dynamic));
 2757   match(VecX);
 2758   match(VecY);
 2759   match(VecZ);
 2760   match(VecS);
 2761   match(VecD);
 2762 
 2763   format %{ %}
 2764   interface(REG_INTER);
 2765 %}
 2766 
 2767 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
 2768 // Replaced with legVec[SDXYZ] during post-selection cleanup.
 2769 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
 2770 // runtime code generation via reg_class_dynamic.
 2771 operand legVec() %{
 2772   constraint(ALLOC_IN_RC(dynamic));
 2773   match(VecX);
 2774   match(VecY);
 2775   match(VecZ);
 2776   match(VecS);
 2777   match(VecD);
 2778 
 2779   format %{ %}
 2780   interface(REG_INTER);
 2781 %}
 2782 
 2783 // Replaces vec during post-selection cleanup. See above.
 2784 operand vecS() %{
 2785   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
 2786   match(VecS);
 2787 
 2788   format %{ %}
 2789   interface(REG_INTER);
 2790 %}
 2791 
 2792 // Replaces legVec during post-selection cleanup. See above.
 2793 operand legVecS() %{
 2794   constraint(ALLOC_IN_RC(vectors_reg_legacy));
 2795   match(VecS);
 2796 
 2797   format %{ %}
 2798   interface(REG_INTER);
 2799 %}
 2800 
 2801 // Replaces vec during post-selection cleanup. See above.
 2802 operand vecD() %{
 2803   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
 2804   match(VecD);
 2805 
 2806   format %{ %}
 2807   interface(REG_INTER);
 2808 %}
 2809 
 2810 // Replaces legVec during post-selection cleanup. See above.
 2811 operand legVecD() %{
 2812   constraint(ALLOC_IN_RC(vectord_reg_legacy));
 2813   match(VecD);
 2814 
 2815   format %{ %}
 2816   interface(REG_INTER);
 2817 %}
 2818 
 2819 // Replaces vec during post-selection cleanup. See above.
 2820 operand vecX() %{
 2821   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
 2822   match(VecX);
 2823 
 2824   format %{ %}
 2825   interface(REG_INTER);
 2826 %}
 2827 
 2828 // Replaces legVec during post-selection cleanup. See above.
 2829 operand legVecX() %{
 2830   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
 2831   match(VecX);
 2832 
 2833   format %{ %}
 2834   interface(REG_INTER);
 2835 %}
 2836 
 2837 // Replaces vec during post-selection cleanup. See above.
 2838 operand vecY() %{
 2839   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
 2840   match(VecY);
 2841 
 2842   format %{ %}
 2843   interface(REG_INTER);
 2844 %}
 2845 
 2846 // Replaces legVec during post-selection cleanup. See above.
 2847 operand legVecY() %{
 2848   constraint(ALLOC_IN_RC(vectory_reg_legacy));
 2849   match(VecY);
 2850 
 2851   format %{ %}
 2852   interface(REG_INTER);
 2853 %}
 2854 
 2855 // Replaces vec during post-selection cleanup. See above.
 2856 operand vecZ() %{
 2857   constraint(ALLOC_IN_RC(vectorz_reg));
 2858   match(VecZ);
 2859 
 2860   format %{ %}
 2861   interface(REG_INTER);
 2862 %}
 2863 
 2864 // Replaces legVec during post-selection cleanup. See above.
 2865 operand legVecZ() %{
 2866   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
 2867   match(VecZ);
 2868 
 2869   format %{ %}
 2870   interface(REG_INTER);
 2871 %}
 2872 
 2873 // Comparison Code for FP conditional move
 2874 operand cmpOp_vcmppd() %{
 2875   match(Bool);
 2876 
 2877   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
 2878             n->as_Bool()->_test._test != BoolTest::no_overflow);
 2879   format %{ "" %}
 2880   interface(COND_INTER) %{
 2881     equal        (0x0, "eq");
 2882     less         (0x1, "lt");
 2883     less_equal   (0x2, "le");
 2884     not_equal    (0xC, "ne");
 2885     greater_equal(0xD, "ge");
 2886     greater      (0xE, "gt");
    //TODO: adlc cannot compile without the next two lines; it fails with:
    // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
    // equal' for overflow.
 2890     overflow     (0x20, "o");  // not really supported by the instruction
 2891     no_overflow  (0x21, "no"); // not really supported by the instruction
 2892   %}
 2893 %}
 2894 
 2895 
 2896 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
 2897 
 2898 // ============================================================================
 2899 
 2900 instruct ShouldNotReachHere() %{
 2901   match(Halt);
 2902   format %{ "stop\t# ShouldNotReachHere" %}
 2903   ins_encode %{
 2904     if (is_reachable()) {
 2905       __ stop(_halt_reason);
 2906     }
 2907   %}
 2908   ins_pipe(pipe_slow);
 2909 %}
 2910 
 2911 // ============================================================================
 2912 
 2913 instruct addF_reg(regF dst, regF src) %{
 2914   predicate((UseSSE>=1) && (UseAVX == 0));
 2915   match(Set dst (AddF dst src));
 2916 
 2917   format %{ "addss   $dst, $src" %}
 2918   ins_cost(150);
 2919   ins_encode %{
 2920     __ addss($dst$$XMMRegister, $src$$XMMRegister);
 2921   %}
 2922   ins_pipe(pipe_slow);
 2923 %}
 2924 
 2925 instruct addF_mem(regF dst, memory src) %{
 2926   predicate((UseSSE>=1) && (UseAVX == 0));
 2927   match(Set dst (AddF dst (LoadF src)));
 2928 
 2929   format %{ "addss   $dst, $src" %}
 2930   ins_cost(150);
 2931   ins_encode %{
 2932     __ addss($dst$$XMMRegister, $src$$Address);
 2933   %}
 2934   ins_pipe(pipe_slow);
 2935 %}
 2936 
 2937 instruct addF_imm(regF dst, immF con) %{
 2938   predicate((UseSSE>=1) && (UseAVX == 0));
 2939   match(Set dst (AddF dst con));
 2940   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 2941   ins_cost(150);
 2942   ins_encode %{
 2943     __ addss($dst$$XMMRegister, $constantaddress($con));
 2944   %}
 2945   ins_pipe(pipe_slow);
 2946 %}
 2947 
 2948 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
 2949   predicate(UseAVX > 0);
 2950   match(Set dst (AddF src1 src2));
 2951 
 2952   format %{ "vaddss  $dst, $src1, $src2" %}
 2953   ins_cost(150);
 2954   ins_encode %{
 2955     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 2956   %}
 2957   ins_pipe(pipe_slow);
 2958 %}
 2959 
 2960 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
 2961   predicate(UseAVX > 0);
 2962   match(Set dst (AddF src1 (LoadF src2)));
 2963 
 2964   format %{ "vaddss  $dst, $src1, $src2" %}
 2965   ins_cost(150);
 2966   ins_encode %{
 2967     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 2968   %}
 2969   ins_pipe(pipe_slow);
 2970 %}
 2971 
 2972 instruct addF_reg_imm(regF dst, regF src, immF con) %{
 2973   predicate(UseAVX > 0);
 2974   match(Set dst (AddF src con));
 2975 
 2976   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 2977   ins_cost(150);
 2978   ins_encode %{
 2979     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 2980   %}
 2981   ins_pipe(pipe_slow);
 2982 %}
 2983 
 2984 instruct addD_reg(regD dst, regD src) %{
 2985   predicate((UseSSE>=2) && (UseAVX == 0));
 2986   match(Set dst (AddD dst src));
 2987 
 2988   format %{ "addsd   $dst, $src" %}
 2989   ins_cost(150);
 2990   ins_encode %{
 2991     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
 2992   %}
 2993   ins_pipe(pipe_slow);
 2994 %}
 2995 
 2996 instruct addD_mem(regD dst, memory src) %{
 2997   predicate((UseSSE>=2) && (UseAVX == 0));
 2998   match(Set dst (AddD dst (LoadD src)));
 2999 
 3000   format %{ "addsd   $dst, $src" %}
 3001   ins_cost(150);
 3002   ins_encode %{
 3003     __ addsd($dst$$XMMRegister, $src$$Address);
 3004   %}
 3005   ins_pipe(pipe_slow);
 3006 %}
 3007 
 3008 instruct addD_imm(regD dst, immD con) %{
 3009   predicate((UseSSE>=2) && (UseAVX == 0));
 3010   match(Set dst (AddD dst con));
 3011   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3012   ins_cost(150);
 3013   ins_encode %{
 3014     __ addsd($dst$$XMMRegister, $constantaddress($con));
 3015   %}
 3016   ins_pipe(pipe_slow);
 3017 %}
 3018 
 3019 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
 3020   predicate(UseAVX > 0);
 3021   match(Set dst (AddD src1 src2));
 3022 
 3023   format %{ "vaddsd  $dst, $src1, $src2" %}
 3024   ins_cost(150);
 3025   ins_encode %{
 3026     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3027   %}
 3028   ins_pipe(pipe_slow);
 3029 %}
 3030 
 3031 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
 3032   predicate(UseAVX > 0);
 3033   match(Set dst (AddD src1 (LoadD src2)));
 3034 
 3035   format %{ "vaddsd  $dst, $src1, $src2" %}
 3036   ins_cost(150);
 3037   ins_encode %{
 3038     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3039   %}
 3040   ins_pipe(pipe_slow);
 3041 %}
 3042 
 3043 instruct addD_reg_imm(regD dst, regD src, immD con) %{
 3044   predicate(UseAVX > 0);
 3045   match(Set dst (AddD src con));
 3046 
 3047   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3048   ins_cost(150);
 3049   ins_encode %{
 3050     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3051   %}
 3052   ins_pipe(pipe_slow);
 3053 %}
 3054 
 3055 instruct subF_reg(regF dst, regF src) %{
 3056   predicate((UseSSE>=1) && (UseAVX == 0));
 3057   match(Set dst (SubF dst src));
 3058 
 3059   format %{ "subss   $dst, $src" %}
 3060   ins_cost(150);
 3061   ins_encode %{
 3062     __ subss($dst$$XMMRegister, $src$$XMMRegister);
 3063   %}
 3064   ins_pipe(pipe_slow);
 3065 %}
 3066 
 3067 instruct subF_mem(regF dst, memory src) %{
 3068   predicate((UseSSE>=1) && (UseAVX == 0));
 3069   match(Set dst (SubF dst (LoadF src)));
 3070 
 3071   format %{ "subss   $dst, $src" %}
 3072   ins_cost(150);
 3073   ins_encode %{
 3074     __ subss($dst$$XMMRegister, $src$$Address);
 3075   %}
 3076   ins_pipe(pipe_slow);
 3077 %}
 3078 
 3079 instruct subF_imm(regF dst, immF con) %{
 3080   predicate((UseSSE>=1) && (UseAVX == 0));
 3081   match(Set dst (SubF dst con));
 3082   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3083   ins_cost(150);
 3084   ins_encode %{
 3085     __ subss($dst$$XMMRegister, $constantaddress($con));
 3086   %}
 3087   ins_pipe(pipe_slow);
 3088 %}
 3089 
 3090 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
 3091   predicate(UseAVX > 0);
 3092   match(Set dst (SubF src1 src2));
 3093 
 3094   format %{ "vsubss  $dst, $src1, $src2" %}
 3095   ins_cost(150);
 3096   ins_encode %{
 3097     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3098   %}
 3099   ins_pipe(pipe_slow);
 3100 %}
 3101 
 3102 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
 3103   predicate(UseAVX > 0);
 3104   match(Set dst (SubF src1 (LoadF src2)));
 3105 
 3106   format %{ "vsubss  $dst, $src1, $src2" %}
 3107   ins_cost(150);
 3108   ins_encode %{
 3109     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3110   %}
 3111   ins_pipe(pipe_slow);
 3112 %}
 3113 
 3114 instruct subF_reg_imm(regF dst, regF src, immF con) %{
 3115   predicate(UseAVX > 0);
 3116   match(Set dst (SubF src con));
 3117 
 3118   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3119   ins_cost(150);
 3120   ins_encode %{
 3121     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3122   %}
 3123   ins_pipe(pipe_slow);
 3124 %}
 3125 
 3126 instruct subD_reg(regD dst, regD src) %{
 3127   predicate((UseSSE>=2) && (UseAVX == 0));
 3128   match(Set dst (SubD dst src));
 3129 
 3130   format %{ "subsd   $dst, $src" %}
 3131   ins_cost(150);
 3132   ins_encode %{
 3133     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
 3134   %}
 3135   ins_pipe(pipe_slow);
 3136 %}
 3137 
 3138 instruct subD_mem(regD dst, memory src) %{
 3139   predicate((UseSSE>=2) && (UseAVX == 0));
 3140   match(Set dst (SubD dst (LoadD src)));
 3141 
 3142   format %{ "subsd   $dst, $src" %}
 3143   ins_cost(150);
 3144   ins_encode %{
 3145     __ subsd($dst$$XMMRegister, $src$$Address);
 3146   %}
 3147   ins_pipe(pipe_slow);
 3148 %}
 3149 
 3150 instruct subD_imm(regD dst, immD con) %{
 3151   predicate((UseSSE>=2) && (UseAVX == 0));
 3152   match(Set dst (SubD dst con));
 3153   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3154   ins_cost(150);
 3155   ins_encode %{
 3156     __ subsd($dst$$XMMRegister, $constantaddress($con));
 3157   %}
 3158   ins_pipe(pipe_slow);
 3159 %}
 3160 
 3161 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
 3162   predicate(UseAVX > 0);
 3163   match(Set dst (SubD src1 src2));
 3164 
 3165   format %{ "vsubsd  $dst, $src1, $src2" %}
 3166   ins_cost(150);
 3167   ins_encode %{
 3168     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3169   %}
 3170   ins_pipe(pipe_slow);
 3171 %}
 3172 
 3173 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
 3174   predicate(UseAVX > 0);
 3175   match(Set dst (SubD src1 (LoadD src2)));
 3176 
 3177   format %{ "vsubsd  $dst, $src1, $src2" %}
 3178   ins_cost(150);
 3179   ins_encode %{
 3180     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3181   %}
 3182   ins_pipe(pipe_slow);
 3183 %}
 3184 
 3185 instruct subD_reg_imm(regD dst, regD src, immD con) %{
 3186   predicate(UseAVX > 0);
 3187   match(Set dst (SubD src con));
 3188 
 3189   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3190   ins_cost(150);
 3191   ins_encode %{
 3192     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3193   %}
 3194   ins_pipe(pipe_slow);
 3195 %}
 3196 
 3197 instruct mulF_reg(regF dst, regF src) %{
 3198   predicate((UseSSE>=1) && (UseAVX == 0));
 3199   match(Set dst (MulF dst src));
 3200 
 3201   format %{ "mulss   $dst, $src" %}
 3202   ins_cost(150);
 3203   ins_encode %{
 3204     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
 3205   %}
 3206   ins_pipe(pipe_slow);
 3207 %}
 3208 
 3209 instruct mulF_mem(regF dst, memory src) %{
 3210   predicate((UseSSE>=1) && (UseAVX == 0));
 3211   match(Set dst (MulF dst (LoadF src)));
 3212 
 3213   format %{ "mulss   $dst, $src" %}
 3214   ins_cost(150);
 3215   ins_encode %{
 3216     __ mulss($dst$$XMMRegister, $src$$Address);
 3217   %}
 3218   ins_pipe(pipe_slow);
 3219 %}
 3220 
 3221 instruct mulF_imm(regF dst, immF con) %{
 3222   predicate((UseSSE>=1) && (UseAVX == 0));
 3223   match(Set dst (MulF dst con));
 3224   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3225   ins_cost(150);
 3226   ins_encode %{
 3227     __ mulss($dst$$XMMRegister, $constantaddress($con));
 3228   %}
 3229   ins_pipe(pipe_slow);
 3230 %}
 3231 
 3232 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
 3233   predicate(UseAVX > 0);
 3234   match(Set dst (MulF src1 src2));
 3235 
 3236   format %{ "vmulss  $dst, $src1, $src2" %}
 3237   ins_cost(150);
 3238   ins_encode %{
 3239     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3240   %}
 3241   ins_pipe(pipe_slow);
 3242 %}
 3243 
 3244 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
 3245   predicate(UseAVX > 0);
 3246   match(Set dst (MulF src1 (LoadF src2)));
 3247 
 3248   format %{ "vmulss  $dst, $src1, $src2" %}
 3249   ins_cost(150);
 3250   ins_encode %{
 3251     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3252   %}
 3253   ins_pipe(pipe_slow);
 3254 %}
 3255 
 3256 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
 3257   predicate(UseAVX > 0);
 3258   match(Set dst (MulF src con));
 3259 
 3260   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3261   ins_cost(150);
 3262   ins_encode %{
 3263     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3264   %}
 3265   ins_pipe(pipe_slow);
 3266 %}
 3267 
 3268 instruct mulD_reg(regD dst, regD src) %{
 3269   predicate((UseSSE>=2) && (UseAVX == 0));
 3270   match(Set dst (MulD dst src));
 3271 
 3272   format %{ "mulsd   $dst, $src" %}
 3273   ins_cost(150);
 3274   ins_encode %{
 3275     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
 3276   %}
 3277   ins_pipe(pipe_slow);
 3278 %}
 3279 
 3280 instruct mulD_mem(regD dst, memory src) %{
 3281   predicate((UseSSE>=2) && (UseAVX == 0));
 3282   match(Set dst (MulD dst (LoadD src)));
 3283 
 3284   format %{ "mulsd   $dst, $src" %}
 3285   ins_cost(150);
 3286   ins_encode %{
 3287     __ mulsd($dst$$XMMRegister, $src$$Address);
 3288   %}
 3289   ins_pipe(pipe_slow);
 3290 %}
 3291 
 3292 instruct mulD_imm(regD dst, immD con) %{
 3293   predicate((UseSSE>=2) && (UseAVX == 0));
 3294   match(Set dst (MulD dst con));
 3295   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3296   ins_cost(150);
 3297   ins_encode %{
 3298     __ mulsd($dst$$XMMRegister, $constantaddress($con));
 3299   %}
 3300   ins_pipe(pipe_slow);
 3301 %}
 3302 
 3303 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
 3304   predicate(UseAVX > 0);
 3305   match(Set dst (MulD src1 src2));
 3306 
 3307   format %{ "vmulsd  $dst, $src1, $src2" %}
 3308   ins_cost(150);
 3309   ins_encode %{
 3310     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3311   %}
 3312   ins_pipe(pipe_slow);
 3313 %}
 3314 
 3315 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
 3316   predicate(UseAVX > 0);
 3317   match(Set dst (MulD src1 (LoadD src2)));
 3318 
 3319   format %{ "vmulsd  $dst, $src1, $src2" %}
 3320   ins_cost(150);
 3321   ins_encode %{
 3322     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3323   %}
 3324   ins_pipe(pipe_slow);
 3325 %}
 3326 
 3327 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
 3328   predicate(UseAVX > 0);
 3329   match(Set dst (MulD src con));
 3330 
 3331   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3332   ins_cost(150);
 3333   ins_encode %{
 3334     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3335   %}
 3336   ins_pipe(pipe_slow);
 3337 %}
 3338 
 3339 instruct divF_reg(regF dst, regF src) %{
 3340   predicate((UseSSE>=1) && (UseAVX == 0));
 3341   match(Set dst (DivF dst src));
 3342 
 3343   format %{ "divss   $dst, $src" %}
 3344   ins_cost(150);
 3345   ins_encode %{
 3346     __ divss($dst$$XMMRegister, $src$$XMMRegister);
 3347   %}
 3348   ins_pipe(pipe_slow);
 3349 %}
 3350 
 3351 instruct divF_mem(regF dst, memory src) %{
 3352   predicate((UseSSE>=1) && (UseAVX == 0));
 3353   match(Set dst (DivF dst (LoadF src)));
 3354 
 3355   format %{ "divss   $dst, $src" %}
 3356   ins_cost(150);
 3357   ins_encode %{
 3358     __ divss($dst$$XMMRegister, $src$$Address);
 3359   %}
 3360   ins_pipe(pipe_slow);
 3361 %}
 3362 
 3363 instruct divF_imm(regF dst, immF con) %{
 3364   predicate((UseSSE>=1) && (UseAVX == 0));
 3365   match(Set dst (DivF dst con));
 3366   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
 3367   ins_cost(150);
 3368   ins_encode %{
 3369     __ divss($dst$$XMMRegister, $constantaddress($con));
 3370   %}
 3371   ins_pipe(pipe_slow);
 3372 %}
 3373 
 3374 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
 3375   predicate(UseAVX > 0);
 3376   match(Set dst (DivF src1 src2));
 3377 
 3378   format %{ "vdivss  $dst, $src1, $src2" %}
 3379   ins_cost(150);
 3380   ins_encode %{
 3381     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3382   %}
 3383   ins_pipe(pipe_slow);
 3384 %}
 3385 
 3386 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
 3387   predicate(UseAVX > 0);
 3388   match(Set dst (DivF src1 (LoadF src2)));
 3389 
 3390   format %{ "vdivss  $dst, $src1, $src2" %}
 3391   ins_cost(150);
 3392   ins_encode %{
 3393     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3394   %}
 3395   ins_pipe(pipe_slow);
 3396 %}
 3397 
 3398 instruct divF_reg_imm(regF dst, regF src, immF con) %{
 3399   predicate(UseAVX > 0);
 3400   match(Set dst (DivF src con));
 3401 
 3402   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
 3403   ins_cost(150);
 3404   ins_encode %{
 3405     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3406   %}
 3407   ins_pipe(pipe_slow);
 3408 %}
 3409 
 3410 instruct divD_reg(regD dst, regD src) %{
 3411   predicate((UseSSE>=2) && (UseAVX == 0));
 3412   match(Set dst (DivD dst src));
 3413 
 3414   format %{ "divsd   $dst, $src" %}
 3415   ins_cost(150);
 3416   ins_encode %{
 3417     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
 3418   %}
 3419   ins_pipe(pipe_slow);
 3420 %}
 3421 
 3422 instruct divD_mem(regD dst, memory src) %{
 3423   predicate((UseSSE>=2) && (UseAVX == 0));
 3424   match(Set dst (DivD dst (LoadD src)));
 3425 
 3426   format %{ "divsd   $dst, $src" %}
 3427   ins_cost(150);
 3428   ins_encode %{
 3429     __ divsd($dst$$XMMRegister, $src$$Address);
 3430   %}
 3431   ins_pipe(pipe_slow);
 3432 %}
 3433 
 3434 instruct divD_imm(regD dst, immD con) %{
 3435   predicate((UseSSE>=2) && (UseAVX == 0));
 3436   match(Set dst (DivD dst con));
 3437   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
 3438   ins_cost(150);
 3439   ins_encode %{
 3440     __ divsd($dst$$XMMRegister, $constantaddress($con));
 3441   %}
 3442   ins_pipe(pipe_slow);
 3443 %}
 3444 
 3445 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
 3446   predicate(UseAVX > 0);
 3447   match(Set dst (DivD src1 src2));
 3448 
 3449   format %{ "vdivsd  $dst, $src1, $src2" %}
 3450   ins_cost(150);
 3451   ins_encode %{
 3452     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
 3453   %}
 3454   ins_pipe(pipe_slow);
 3455 %}
 3456 
 3457 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
 3458   predicate(UseAVX > 0);
 3459   match(Set dst (DivD src1 (LoadD src2)));
 3460 
 3461   format %{ "vdivsd  $dst, $src1, $src2" %}
 3462   ins_cost(150);
 3463   ins_encode %{
 3464     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
 3465   %}
 3466   ins_pipe(pipe_slow);
 3467 %}
 3468 
 3469 instruct divD_reg_imm(regD dst, regD src, immD con) %{
 3470   predicate(UseAVX > 0);
 3471   match(Set dst (DivD src con));
 3472 
 3473   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
 3474   ins_cost(150);
 3475   ins_encode %{
 3476     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
 3477   %}
 3478   ins_pipe(pipe_slow);
 3479 %}
 3480 
 3481 instruct absF_reg(regF dst) %{
 3482   predicate((UseSSE>=1) && (UseAVX == 0));
 3483   match(Set dst (AbsF dst));
 3484   ins_cost(150);
 3485   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
 3486   ins_encode %{
 3487     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
 3488   %}
 3489   ins_pipe(pipe_slow);
 3490 %}
 3491 
 3492 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
 3493   predicate(UseAVX > 0);
 3494   match(Set dst (AbsF src));
 3495   ins_cost(150);
 3496   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
 3497   ins_encode %{
 3498     int vlen_enc = Assembler::AVX_128bit;
 3499     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
 3500               ExternalAddress(float_signmask()), vlen_enc);
 3501   %}
 3502   ins_pipe(pipe_slow);
 3503 %}
 3504 
 3505 instruct absD_reg(regD dst) %{
 3506   predicate((UseSSE>=2) && (UseAVX == 0));
 3507   match(Set dst (AbsD dst));
 3508   ins_cost(150);
 3509   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
 3510             "# abs double by sign masking" %}
 3511   ins_encode %{
 3512     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
 3513   %}
 3514   ins_pipe(pipe_slow);
 3515 %}
 3516 
 3517 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
 3518   predicate(UseAVX > 0);
 3519   match(Set dst (AbsD src));
 3520   ins_cost(150);
 3521   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
 3522             "# abs double by sign masking" %}
 3523   ins_encode %{
 3524     int vlen_enc = Assembler::AVX_128bit;
 3525     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
 3526               ExternalAddress(double_signmask()), vlen_enc);
 3527   %}
 3528   ins_pipe(pipe_slow);
 3529 %}
 3530 
 3531 instruct negF_reg(regF dst) %{
 3532   predicate((UseSSE>=1) && (UseAVX == 0));
 3533   match(Set dst (NegF dst));
 3534   ins_cost(150);
 3535   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
 3536   ins_encode %{
 3537     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
 3538   %}
 3539   ins_pipe(pipe_slow);
 3540 %}
 3541 
 3542 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
 3543   predicate(UseAVX > 0);
 3544   match(Set dst (NegF src));
 3545   ins_cost(150);
 3546   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
 3547   ins_encode %{
 3548     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
 3549                  ExternalAddress(float_signflip()));
 3550   %}
 3551   ins_pipe(pipe_slow);
 3552 %}
 3553 
 3554 instruct negD_reg(regD dst) %{
 3555   predicate((UseSSE>=2) && (UseAVX == 0));
 3556   match(Set dst (NegD dst));
 3557   ins_cost(150);
 3558   format %{ "xorpd   $dst, [0x8000000000000000]\t"
 3559             "# neg double by sign flipping" %}
 3560   ins_encode %{
 3561     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
 3562   %}
 3563   ins_pipe(pipe_slow);
 3564 %}
 3565 
 3566 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
 3567   predicate(UseAVX > 0);
 3568   match(Set dst (NegD src));
 3569   ins_cost(150);
 3570   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
 3571             "# neg double by sign flipping" %}
 3572   ins_encode %{
 3573     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
 3574                  ExternalAddress(double_signflip()));
 3575   %}
 3576   ins_pipe(pipe_slow);
 3577 %}
 3578 
// The sqrtss instruction needs its destination register pre-initialized for best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3581 instruct sqrtF_reg(regF dst) %{
 3582   predicate(UseSSE>=1);
 3583   match(Set dst (SqrtF dst));
 3584   format %{ "sqrtss  $dst, $dst" %}
 3585   ins_encode %{
 3586     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
 3587   %}
 3588   ins_pipe(pipe_slow);
 3589 %}
 3590 
// The sqrtsd instruction needs its destination register pre-initialized for best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
 3593 instruct sqrtD_reg(regD dst) %{
 3594   predicate(UseSSE>=2);
 3595   match(Set dst (SqrtD dst));
 3596   format %{ "sqrtsd  $dst, $dst" %}
 3597   ins_encode %{
 3598     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
 3599   %}
 3600   ins_pipe(pipe_slow);
 3601 %}
 3602 
 3603 
 3604 // ---------------------------------------- VectorReinterpret ------------------------------------
 3605 instruct reinterpret_mask(kReg dst) %{
 3606   predicate(n->bottom_type()->isa_vectmask() &&
 3607             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
 3608   match(Set dst (VectorReinterpret dst));
 3609   ins_cost(125);
 3610   format %{ "vector_reinterpret $dst\t!" %}
 3611   ins_encode %{
 3612     // empty
 3613   %}
 3614   ins_pipe( pipe_slow );
 3615 %}
 3616 
 3617 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
 3618   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3619             n->bottom_type()->isa_vectmask() &&
 3620             n->in(1)->bottom_type()->isa_vectmask() &&
 3621             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // same size in bytes as src
 3623   match(Set dst (VectorReinterpret src));
 3624   effect(TEMP xtmp);
 3625   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
 3626   ins_encode %{
    int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
    int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
    assert(src_sz == dst_sz, "src and dst size mismatch");
    int vlen_enc = vector_length_encoding(src_sz);
    __ evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
    __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3633   %}
 3634   ins_pipe( pipe_slow );
 3635 %}
 3636 
 3637 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
 3638   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3639             n->bottom_type()->isa_vectmask() &&
 3640             n->in(1)->bottom_type()->isa_vectmask() &&
 3641             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
 3642              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // same size in bytes as src
 3644   match(Set dst (VectorReinterpret src));
 3645   effect(TEMP xtmp);
 3646   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
 3647   ins_encode %{
    int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
    int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
    assert(src_sz == dst_sz, "src and dst size mismatch");
    int vlen_enc = vector_length_encoding(src_sz);
    __ evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
    __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3654   %}
 3655   ins_pipe( pipe_slow );
 3656 %}
 3657 
 3658 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
 3659   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
 3660             n->bottom_type()->isa_vectmask() &&
 3661             n->in(1)->bottom_type()->isa_vectmask() &&
 3662             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
 3663              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
            n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // same size in bytes as src
 3665   match(Set dst (VectorReinterpret src));
 3666   effect(TEMP xtmp);
 3667   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
 3668   ins_encode %{
    int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
    int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
    assert(src_sz == dst_sz, "src and dst size mismatch");
    int vlen_enc = vector_length_encoding(src_sz);
    __ evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
    __ evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
 3675   %}
 3676   ins_pipe( pipe_slow );
 3677 %}
 3678 
 3679 instruct reinterpret(vec dst) %{
 3680   predicate(!n->bottom_type()->isa_vectmask() &&
 3681             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
 3682   match(Set dst (VectorReinterpret dst));
 3683   ins_cost(125);
 3684   format %{ "vector_reinterpret $dst\t!" %}
 3685   ins_encode %{
 3686     // empty
 3687   %}
 3688   ins_pipe( pipe_slow );
 3689 %}
 3690 
 3691 instruct reinterpret_expand(vec dst, vec src, rRegP scratch) %{
 3692   predicate(UseAVX == 0 &&
 3693             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3694   match(Set dst (VectorReinterpret src));
 3695   ins_cost(125);
 3696   effect(TEMP dst, TEMP scratch);
 3697   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
 3698   ins_encode %{
 3699     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
 3700     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
 3701 
 3702     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
 3703     if (src_vlen_in_bytes == 4) {
 3704       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
 3705     } else {
 3706       assert(src_vlen_in_bytes == 8, "");
 3707       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
 3708     }
 3709     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 3710   %}
 3711   ins_pipe( pipe_slow );
 3712 %}
 3713 
 3714 instruct vreinterpret_expand4(legVec dst, vec src, rRegP scratch) %{
 3715   predicate(UseAVX > 0 &&
 3716             !n->bottom_type()->isa_vectmask() &&
 3717             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
 3718             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3719   match(Set dst (VectorReinterpret src));
 3720   ins_cost(125);
 3721   effect(TEMP scratch);
 3722   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
 3723   ins_encode %{
 3724     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, $scratch$$Register);
 3725   %}
 3726   ins_pipe( pipe_slow );
 3727 %}
 3728 
 3729 
 3730 instruct vreinterpret_expand(legVec dst, vec src) %{
 3731   predicate(UseAVX > 0 &&
 3732             !n->bottom_type()->isa_vectmask() &&
 3733             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
 3734             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
 3735   match(Set dst (VectorReinterpret src));
 3736   ins_cost(125);
 3737   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
 3738   ins_encode %{
 3739     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3740       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3741       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3742       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3743       default: ShouldNotReachHere();
 3744     }
 3745   %}
 3746   ins_pipe( pipe_slow );
 3747 %}
 3748 
 3749 instruct reinterpret_shrink(vec dst, legVec src) %{
 3750   predicate(!n->bottom_type()->isa_vectmask() &&
 3751             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
 3752   match(Set dst (VectorReinterpret src));
 3753   ins_cost(125);
 3754   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
 3755   ins_encode %{
 3756     switch (Matcher::vector_length_in_bytes(this)) {
 3757       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
 3758       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
 3759       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
 3760       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
 3761       default: ShouldNotReachHere();
 3762     }
 3763   %}
 3764   ins_pipe( pipe_slow );
 3765 %}
 3766 
 3767 // ----------------------------------------------------------------------------------------------------
 3768 
 3769 #ifdef _LP64
 3770 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
 3771   match(Set dst (RoundDoubleMode src rmode));
 3772   format %{ "roundsd $dst,$src" %}
 3773   ins_cost(150);
 3774   ins_encode %{
 3775     assert(UseSSE >= 4, "required");
 3776     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
 3777   %}
 3778   ins_pipe(pipe_slow);
 3779 %}
 3780 
 3781 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
 3782   match(Set dst (RoundDoubleMode (LoadD src) rmode));
 3783   format %{ "roundsd $dst,$src" %}
 3784   ins_cost(150);
 3785   ins_encode %{
 3786     assert(UseSSE >= 4, "required");
 3787     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
 3788   %}
 3789   ins_pipe(pipe_slow);
 3790 %}
 3791 
 3792 instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
 3793   match(Set dst (RoundDoubleMode con rmode));
 3794   effect(TEMP scratch_reg);
 3795   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
 3796   ins_cost(150);
 3797   ins_encode %{
 3798     assert(UseSSE >= 4, "required");
 3799     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
 3800   %}
 3801   ins_pipe(pipe_slow);
 3802 %}
 3803 
 3804 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
 3805   predicate(Matcher::vector_length(n) < 8);
 3806   match(Set dst (RoundDoubleModeV src rmode));
 3807   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
 3808   ins_encode %{
 3809     assert(UseAVX > 0, "required");
 3810     int vlen_enc = vector_length_encoding(this);
 3811     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
 3812   %}
 3813   ins_pipe( pipe_slow );
 3814 %}
 3815 
 3816 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
 3817   predicate(Matcher::vector_length(n) == 8);
 3818   match(Set dst (RoundDoubleModeV src rmode));
 3819   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
 3820   ins_encode %{
 3821     assert(UseAVX > 2, "required");
 3822     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
 3823   %}
 3824   ins_pipe( pipe_slow );
 3825 %}
 3826 
 3827 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
 3828   predicate(Matcher::vector_length(n) < 8);
 3829   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3830   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
 3831   ins_encode %{
 3832     assert(UseAVX > 0, "required");
 3833     int vlen_enc = vector_length_encoding(this);
 3834     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
 3835   %}
 3836   ins_pipe( pipe_slow );
 3837 %}
 3838 
 3839 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
 3840   predicate(Matcher::vector_length(n) == 8);
 3841   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
 3842   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
 3843   ins_encode %{
 3844     assert(UseAVX > 2, "required");
 3845     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
 3846   %}
 3847   ins_pipe( pipe_slow );
 3848 %}
 3849 #endif // _LP64
 3850 
 3851 instruct onspinwait() %{
 3852   match(OnSpinWait);
 3853   ins_cost(200);
 3854 
 3855   format %{
 3856     $$template
 3857     $$emit$$"pause\t! membar_onspinwait"
 3858   %}
 3859   ins_encode %{
 3860     __ pause();
 3861   %}
 3862   ins_pipe(pipe_slow);
 3863 %}
 3864 
 3865 // a * b + c
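// Fused multiply-add: the product is not rounded before the addition, so the
// whole expression sees a single rounding step. The destination doubles as
// the addend, matching the destructive three-operand FMA forms (e.g.
// vfmadd231sd), which is why the pattern writes its result back into $c.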
 3866 instruct fmaD_reg(regD a, regD b, regD c) %{
 3867   predicate(UseFMA);
 3868   match(Set c (FmaD  c (Binary a b)));
 3869   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
 3870   ins_cost(150);
 3871   ins_encode %{
 3872     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3873   %}
 3874   ins_pipe( pipe_slow );
 3875 %}
 3876 
 3877 // a * b + c
 3878 instruct fmaF_reg(regF a, regF b, regF c) %{
 3879   predicate(UseFMA);
 3880   match(Set c (FmaF  c (Binary a b)));
 3881   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
 3882   ins_cost(150);
 3883   ins_encode %{
 3884     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
 3885   %}
 3886   ins_pipe( pipe_slow );
 3887 %}
 3888 
 3889 // ====================VECTOR INSTRUCTIONS=====================================
 3890 
 3891 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
 3892 instruct MoveVec2Leg(legVec dst, vec src) %{
 3893   match(Set dst src);
 3894   format %{ "" %}
 3895   ins_encode %{
 3896     ShouldNotReachHere();
 3897   %}
 3898   ins_pipe( fpu_reg_reg );
 3899 %}
 3900 
 3901 instruct MoveLeg2Vec(vec dst, legVec src) %{
 3902   match(Set dst src);
 3903   format %{ "" %}
 3904   ins_encode %{
 3905     ShouldNotReachHere();
 3906   %}
 3907   ins_pipe( fpu_reg_reg );
 3908 %}
 3909 
 3910 // ============================================================================
 3911 
 3912 // Load vectors generic operand pattern
 3913 instruct loadV(vec dst, memory mem) %{
 3914   match(Set dst (LoadVector mem));
 3915   ins_cost(125);
 3916   format %{ "load_vector $dst,$mem" %}
 3917   ins_encode %{
 3918     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
 3919   %}
 3920   ins_pipe( pipe_slow );
 3921 %}
 3922 
 3923 // Store vectors generic operand pattern.
 3924 instruct storeV(memory mem, vec src) %{
 3925   match(Set mem (StoreVector mem src));
 3926   ins_cost(145);
  format %{ "store_vector $mem,$src" %}
 3928   ins_encode %{
 3929     switch (Matcher::vector_length_in_bytes(this, $src)) {
 3930       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
 3931       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
 3932       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
 3933       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
 3934       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
 3935       default: ShouldNotReachHere();
 3936     }
 3937   %}
 3938   ins_pipe( pipe_slow );
 3939 %}
 3940 
 3941 // ---------------------------------------- Gather ------------------------------------
 3942 
 3943 // Gather INT, LONG, FLOAT, DOUBLE
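// A gather loads, per lane i, dst[i] = base[idx[i]]; the mask selects which
// lanes are actually loaded. The unpredicated patterns below materialize an
// all-ones mask so that every lane is gathered.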
 3944 
 3945 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
 3946   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
 3947   match(Set dst (LoadVectorGather mem idx));
 3948   effect(TEMP dst, TEMP tmp, TEMP mask);
 3949   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
 3950   ins_encode %{
 3951     assert(UseAVX >= 2, "sanity");
 3952 
 3953     int vlen_enc = vector_length_encoding(this);
 3954     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 3955 
 3956     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
 3957     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 3958 
 3959     if (vlen_enc == Assembler::AVX_128bit) {
 3960       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
 3961     } else {
 3962       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
 3963     }
 3964     __ lea($tmp$$Register, $mem$$Address);
 3965     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 3966   %}
 3967   ins_pipe( pipe_slow );
 3968 %}
 3969 
 3970 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
 3971   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
 3972   match(Set dst (LoadVectorGather mem idx));
 3973   effect(TEMP dst, TEMP tmp, TEMP ktmp);
  format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
 3975   ins_encode %{
 3976     assert(UseAVX > 2, "sanity");
 3977 
 3978     int vlen_enc = vector_length_encoding(this);
 3979     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 3980 
 3981     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 3982 
 3983     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
 3984     __ lea($tmp$$Register, $mem$$Address);
 3985     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 3986   %}
 3987   ins_pipe( pipe_slow );
 3988 %}
 3989 
 3990 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 3991   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
 3992   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
  format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
 3994   ins_encode %{
 3995     assert(UseAVX > 2, "sanity");
 3996     int vlen_enc = vector_length_encoding(this);
 3997     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 3998     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: since the gather instruction partially updates the opmask register
    // used for predication, the mask operand is first copied to a temporary.
    __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
    // Zero the destination so that lanes whose mask bit is clear read as zero.
    __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4003     __ lea($tmp$$Register, $mem$$Address);
 4004     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
 4005   %}
 4006   ins_pipe( pipe_slow );
 4007 %}

// ====================Scatter=======================================
 4009 
 4010 // Scatter INT, LONG, FLOAT, DOUBLE
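// A scatter stores, per lane i, mem[idx[i]] = src[i]. AVX-512 scatters are
// always predicated, so the unmasked pattern below first loads an all-ones
// opmask.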
 4011 
 4012 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
 4013   predicate(UseAVX > 2);
 4014   match(Set mem (StoreVectorScatter mem (Binary src idx)));
 4015   effect(TEMP tmp, TEMP ktmp);
  format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
 4017   ins_encode %{
 4018     int vlen_enc = vector_length_encoding(this, $src);
 4019     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4020 
 4021     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4022     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
 4023 
 4024     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
 4025     __ lea($tmp$$Register, $mem$$Address);
 4026     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4027   %}
 4028   ins_pipe( pipe_slow );
 4029 %}
 4030 
 4031 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
 4032   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
 4033   effect(TEMP tmp, TEMP ktmp);
  format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t! using $tmp and $ktmp as TEMP" %}
 4035   ins_encode %{
 4036     int vlen_enc = vector_length_encoding(this, $src);
 4037     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 4038     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
 4039     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: since the scatter instruction partially updates the opmask register
    // used for predication, the mask operand is first copied to a temporary.
 4042     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
 4043     __ lea($tmp$$Register, $mem$$Address);
 4044     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
 4045   %}
 4046   ins_pipe( pipe_slow );
 4047 %}
 4048 
 4049 // ====================REPLICATE=======================================
 4050 
 4051 // Replicate byte scalar to be vector
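// Three lowering strategies, newest ISA first:
//  1. EVEX broadcast straight from a GPR (evpbroadcastb) where AVX512BW(+VL)
//     allows it,
//  2. AVX2 register broadcast (movdl + vpbroadcastb),
//  3. an SSE shuffle ladder (punpcklbw + pshuflw, widened with punpcklqdq and
//     vinserti128_high as the vector grows).
// The ReplS/ReplI/ReplL patterns below follow the same shape.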
 4052 instruct ReplB_reg(vec dst, rRegI src) %{
 4053   match(Set dst (ReplicateB src));
 4054   format %{ "replicateB $dst,$src" %}
 4055   ins_encode %{
 4056     uint vlen = Matcher::vector_length(this);
 4057     if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4058       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
 4059       int vlen_enc = vector_length_encoding(this);
 4060       __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
 4061     } else if (VM_Version::supports_avx2()) {
 4062       int vlen_enc = vector_length_encoding(this);
 4063       __ movdl($dst$$XMMRegister, $src$$Register);
 4064       __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4065     } else {
 4066       __ movdl($dst$$XMMRegister, $src$$Register);
 4067       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
 4068       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4069       if (vlen >= 16) {
 4070         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4071         if (vlen >= 32) {
 4072           assert(vlen == 32, "sanity");
 4073           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4074         }
 4075       }
 4076     }
 4077   %}
 4078   ins_pipe( pipe_slow );
 4079 %}
 4080 
 4081 instruct ReplB_mem(vec dst, memory mem) %{
 4082   predicate(VM_Version::supports_avx2());
 4083   match(Set dst (ReplicateB (LoadB mem)));
 4084   format %{ "replicateB $dst,$mem" %}
 4085   ins_encode %{
 4086     int vlen_enc = vector_length_encoding(this);
 4087     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4088   %}
 4089   ins_pipe( pipe_slow );
 4090 %}
 4091 
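// Immediate replication goes through the constant table: vreplicate_imm
// repeats the immediate into a constant entry (e.g. ReplicateB of 0x1f yields
// the byte pattern 0x1f 0x1f 0x1f ...), and load_vector then fetches it at
// the required width. The other Repl*_imm patterns below work the same way.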
 4092 instruct ReplB_imm(vec dst, immI con) %{
 4093   match(Set dst (ReplicateB con));
 4094   format %{ "replicateB $dst,$con" %}
 4095   ins_encode %{
 4096     InternalAddress addr = $constantaddress(T_BYTE, vreplicate_imm(T_BYTE, $con$$constant, Matcher::vector_length(this)));
 4097     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
 4098   %}
 4099   ins_pipe( pipe_slow );
 4100 %}
 4101 
 4102 // ====================ReplicateS=======================================
 4103 
 4104 instruct ReplS_reg(vec dst, rRegI src) %{
 4105   match(Set dst (ReplicateS src));
 4106   format %{ "replicateS $dst,$src" %}
 4107   ins_encode %{
 4108     uint vlen = Matcher::vector_length(this);
 4109     if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
 4110       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
 4111       int vlen_enc = vector_length_encoding(this);
 4112       __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
 4113     } else if (VM_Version::supports_avx2()) {
 4114       int vlen_enc = vector_length_encoding(this);
 4115       __ movdl($dst$$XMMRegister, $src$$Register);
 4116       __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4117     } else {
 4118       __ movdl($dst$$XMMRegister, $src$$Register);
 4119       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4120       if (vlen >= 8) {
 4121         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4122         if (vlen >= 16) {
 4123           assert(vlen == 16, "sanity");
 4124           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4125         }
 4126       }
 4127     }
 4128   %}
 4129   ins_pipe( pipe_slow );
 4130 %}
 4131 
 4132 instruct ReplS_mem(vec dst, memory mem) %{
 4133   predicate(VM_Version::supports_avx2());
 4134   match(Set dst (ReplicateS (LoadS mem)));
 4135   format %{ "replicateS $dst,$mem" %}
 4136   ins_encode %{
 4137     int vlen_enc = vector_length_encoding(this);
 4138     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4139   %}
 4140   ins_pipe( pipe_slow );
 4141 %}
 4142 
 4143 instruct ReplS_imm(vec dst, immI con) %{
 4144   match(Set dst (ReplicateS con));
 4145   format %{ "replicateS $dst,$con" %}
 4146   ins_encode %{
 4147     InternalAddress addr = $constantaddress(T_SHORT, vreplicate_imm(T_SHORT, $con$$constant, Matcher::vector_length(this)));
 4148     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
 4149   %}
 4150   ins_pipe( pipe_slow );
 4151 %}
 4152 
 4153 // ====================ReplicateI=======================================
 4154 
 4155 instruct ReplI_reg(vec dst, rRegI src) %{
 4156   match(Set dst (ReplicateI src));
 4157   format %{ "replicateI $dst,$src" %}
 4158   ins_encode %{
 4159     uint vlen = Matcher::vector_length(this);
 4160     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4161       int vlen_enc = vector_length_encoding(this);
 4162       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
 4163     } else if (VM_Version::supports_avx2()) {
 4164       int vlen_enc = vector_length_encoding(this);
 4165       __ movdl($dst$$XMMRegister, $src$$Register);
 4166       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4167     } else {
 4168       __ movdl($dst$$XMMRegister, $src$$Register);
 4169       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4170       if (vlen >= 8) {
 4171         assert(vlen == 8, "sanity");
 4172         __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4173       }
 4174     }
 4175   %}
 4176   ins_pipe( pipe_slow );
 4177 %}
 4178 
 4179 instruct ReplI_mem(vec dst, memory mem) %{
 4180   match(Set dst (ReplicateI (LoadI mem)));
 4181   format %{ "replicateI $dst,$mem" %}
 4182   ins_encode %{
 4183     uint vlen = Matcher::vector_length(this);
 4184     if (vlen <= 4) {
 4185       __ movdl($dst$$XMMRegister, $mem$$Address);
 4186       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4187     } else {
 4188       assert(VM_Version::supports_avx2(), "sanity");
 4189       int vlen_enc = vector_length_encoding(this);
 4190       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4191     }
 4192   %}
 4193   ins_pipe( pipe_slow );
 4194 %}
 4195 
 4196 instruct ReplI_imm(vec dst, immI con) %{
 4197   match(Set dst (ReplicateI con));
 4198   format %{ "replicateI $dst,$con" %}
 4199   ins_encode %{
 4200     InternalAddress addr = $constantaddress(T_INT, vreplicate_imm(T_INT, $con$$constant, Matcher::vector_length(this)));
 4201     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
 4202   %}
 4203   ins_pipe( pipe_slow );
 4204 %}
 4205 
 4206 // Replicate scalar zero to be vector
 4207 instruct ReplI_zero(vec dst, immI_0 zero) %{
 4208   match(Set dst (ReplicateB zero));
 4209   match(Set dst (ReplicateS zero));
 4210   match(Set dst (ReplicateI zero));
 4211   format %{ "replicateI $dst,$zero" %}
 4212   ins_encode %{
 4213     uint vsize = Matcher::vector_length_in_bytes(this);
 4214     if (vsize <= 16) {
 4215       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4216     } else {
 4217       int vlen_enc = vector_length_encoding(this);
 4218       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4219     }
 4220   %}
 4221   ins_pipe( fpu_reg_reg );
 4222 %}
 4223 
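// All-ones vectors are synthesized rather than loaded: vallones compares a
// register with itself (vpcmpeqd, or the EVEX ternary-logic equivalent),
// setting every bit without a memory access.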
 4224 instruct ReplI_M1(vec dst, immI_M1 con) %{
 4225   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) >= 16);
 4226   match(Set dst (ReplicateB con));
 4227   match(Set dst (ReplicateS con));
 4228   match(Set dst (ReplicateI con));
 4229   effect(TEMP dst);
 4230   format %{ "vallones $dst" %}
 4231   ins_encode %{
 4232     int vector_len = vector_length_encoding(this);
 4233     __ vallones($dst$$XMMRegister, vector_len);
 4234   %}
 4235   ins_pipe( pipe_slow );
 4236 %}
 4237 
 4238 // ====================ReplicateL=======================================
 4239 
 4240 #ifdef _LP64
 4241 // Replicate long (8 byte) scalar to be vector
 4242 instruct ReplL_reg(vec dst, rRegL src) %{
 4243   match(Set dst (ReplicateL src));
 4244   format %{ "replicateL $dst,$src" %}
 4245   ins_encode %{
 4246     uint vlen = Matcher::vector_length(this);
 4247     if (vlen == 2) {
 4248       __ movdq($dst$$XMMRegister, $src$$Register);
 4249       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4250     } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4251       int vlen_enc = vector_length_encoding(this);
 4252       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
 4253     } else if (VM_Version::supports_avx2()) {
 4254       assert(vlen == 4, "sanity");
 4255       int vlen_enc = vector_length_encoding(this);
 4256       __ movdq($dst$$XMMRegister, $src$$Register);
 4257       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4258     } else {
 4259       assert(vlen == 4, "sanity");
 4260       __ movdq($dst$$XMMRegister, $src$$Register);
 4261       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4262       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4263     }
 4264   %}
 4265   ins_pipe( pipe_slow );
 4266 %}
 4267 #else // _LP64
 4268 // Replicate long (8 byte) scalar to be vector
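// Without 64-bit GPRs the long arrives as a register pair; the low and high
// 32-bit halves (HIGH_FROM_LOW names the upper half of the pair) are moved
// into XMM separately and recombined with punpckldq before broadcasting.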
 4269 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
 4270   predicate(Matcher::vector_length(n) <= 4);
 4271   match(Set dst (ReplicateL src));
 4272   effect(TEMP dst, USE src, TEMP tmp);
 4273   format %{ "replicateL $dst,$src" %}
 4274   ins_encode %{
 4275     uint vlen = Matcher::vector_length(this);
 4276     if (vlen == 2) {
 4277       __ movdl($dst$$XMMRegister, $src$$Register);
 4278       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4279       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4280       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4281     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
 4282       int vlen_enc = Assembler::AVX_256bit;
 4283       __ movdl($dst$$XMMRegister, $src$$Register);
 4284       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4285       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4286       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4287     } else {
 4288       __ movdl($dst$$XMMRegister, $src$$Register);
 4289       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4290       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4291       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4292       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4293     }
 4294   %}
 4295   ins_pipe( pipe_slow );
 4296 %}
 4297 
 4298 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
 4299   predicate(Matcher::vector_length(n) == 8);
 4300   match(Set dst (ReplicateL src));
 4301   effect(TEMP dst, USE src, TEMP tmp);
 4302   format %{ "replicateL $dst,$src" %}
 4303   ins_encode %{
 4304     if (VM_Version::supports_avx512vl()) {
 4305       __ movdl($dst$$XMMRegister, $src$$Register);
 4306       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4307       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4308       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4309       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4310       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
 4311     } else {
 4312       int vlen_enc = Assembler::AVX_512bit;
 4313       __ movdl($dst$$XMMRegister, $src$$Register);
 4314       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
 4315       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
 4316       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4317     }
 4318   %}
 4319   ins_pipe( pipe_slow );
 4320 %}
 4321 #endif // _LP64
 4322 
 4323 instruct ReplL_mem(vec dst, memory mem) %{
 4324   match(Set dst (ReplicateL (LoadL mem)));
 4325   format %{ "replicateL $dst,$mem" %}
 4326   ins_encode %{
 4327     uint vlen = Matcher::vector_length(this);
 4328     if (vlen == 2) {
 4329       __ movq($dst$$XMMRegister, $mem$$Address);
 4330       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
 4331     } else {
 4332       assert(VM_Version::supports_avx2(), "sanity");
 4333       int vlen_enc = vector_length_encoding(this);
 4334       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4335     }
 4336   %}
 4337   ins_pipe( pipe_slow );
 4338 %}
 4339 
 4340 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
 4341 instruct ReplL_imm(vec dst, immL con) %{
 4342   match(Set dst (ReplicateL con));
 4343   format %{ "replicateL $dst,$con" %}
 4344   ins_encode %{
 4345     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, Matcher::vector_length(this)));
 4346     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
 4347   %}
 4348   ins_pipe( pipe_slow );
 4349 %}
 4350 
 4351 instruct ReplL_zero(vec dst, immL0 zero) %{
 4352   match(Set dst (ReplicateL zero));
 4353   format %{ "replicateL $dst,$zero" %}
 4354   ins_encode %{
 4355     int vlen = Matcher::vector_length(this);
 4356     if (vlen == 2) {
 4357       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
 4358     } else {
 4359       int vlen_enc = vector_length_encoding(this);
 4360       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 4361     }
 4362   %}
 4363   ins_pipe( fpu_reg_reg );
 4364 %}
 4365 
 4366 instruct ReplL_M1(vec dst, immL_M1 con) %{
 4367   predicate(UseAVX > 0);
 4368   match(Set dst (ReplicateL con));
 4369   effect(TEMP dst);
 4370   format %{ "vallones $dst" %}
 4371   ins_encode %{
 4372     int vector_len = vector_length_encoding(this);
 4373     __ vallones($dst$$XMMRegister, vector_len);
 4374   %}
 4375   ins_pipe( pipe_slow );
 4376 %}
 4377 
 4378 // ====================ReplicateF=======================================
 4379 
 4380 instruct ReplF_reg(vec dst, vlRegF src) %{
 4381   match(Set dst (ReplicateF src));
 4382   format %{ "replicateF $dst,$src" %}
 4383   ins_encode %{
 4384     uint vlen = Matcher::vector_length(this);
 4385     if (vlen <= 4) {
 4386       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
    } else if (VM_Version::supports_avx2()) {
 4388       int vlen_enc = vector_length_encoding(this);
 4389       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4390     } else {
 4391       assert(vlen == 8, "sanity");
 4392       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
 4393       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4394     }
 4395   %}
 4396   ins_pipe( pipe_slow );
 4397 %}
 4398 
 4399 instruct ReplF_mem(vec dst, memory mem) %{
 4400   match(Set dst (ReplicateF (LoadF mem)));
 4401   format %{ "replicateF $dst,$mem" %}
 4402   ins_encode %{
 4403     uint vlen = Matcher::vector_length(this);
 4404     if (vlen <= 4) {
 4405       __ movdl($dst$$XMMRegister, $mem$$Address);
 4406       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
 4407     } else {
 4408       assert(VM_Version::supports_avx(), "sanity");
 4409       int vlen_enc = vector_length_encoding(this);
 4410       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4411     }
 4412   %}
 4413   ins_pipe( pipe_slow );
 4414 %}
 4415 
 4416 // Replicate float scalar immediate to be vector by loading from const table.
 4417 instruct ReplF_imm(vec dst, immF con) %{
 4418   match(Set dst (ReplicateF con));
 4419   format %{ "replicateF $dst,$con" %}
 4420   ins_encode %{
 4421     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant, Matcher::vector_length(this)));
 4422     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
 4423   %}
 4424   ins_pipe( pipe_slow );
 4425 %}
 4426 
 4427 instruct ReplF_zero(vec dst, immF0 zero) %{
 4428   match(Set dst (ReplicateF zero));
 4429   format %{ "replicateF $dst,$zero" %}
 4430   ins_encode %{
 4431     uint vlen = Matcher::vector_length(this);
 4432     if (vlen <= 4) {
 4433       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
 4434     } else {
 4435       int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
 4437     }
 4438   %}
 4439   ins_pipe( fpu_reg_reg );
 4440 %}
 4441 
 4442 // ====================ReplicateD=======================================
 4443 
 4444 // Replicate double (8 bytes) scalar to be vector
 4445 instruct ReplD_reg(vec dst, vlRegD src) %{
 4446   match(Set dst (ReplicateD src));
 4447   format %{ "replicateD $dst,$src" %}
 4448   ins_encode %{
 4449     uint vlen = Matcher::vector_length(this);
 4450     if (vlen == 2) {
 4451       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4452     } else if (VM_Version::supports_avx2()) {
 4453       int vlen_enc = vector_length_encoding(this);
 4454       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
 4455     } else {
 4456       assert(vlen == 4, "sanity");
 4457       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
 4458       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
 4459     }
 4460   %}
 4461   ins_pipe( pipe_slow );
 4462 %}
 4463 
 4464 instruct ReplD_mem(vec dst, memory mem) %{
 4465   match(Set dst (ReplicateD (LoadD mem)));
 4466   format %{ "replicateD $dst,$mem" %}
 4467   ins_encode %{
 4468     uint vlen = Matcher::vector_length(this);
 4469     if (vlen == 2) {
 4470       __ movq($dst$$XMMRegister, $mem$$Address);
 4471       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
 4472     } else {
 4473       assert(VM_Version::supports_avx(), "sanity");
 4474       int vlen_enc = vector_length_encoding(this);
 4475       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 4476     }
 4477   %}
 4478   ins_pipe( pipe_slow );
 4479 %}
 4480 
 4481 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
 4482 instruct ReplD_imm(vec dst, immD con) %{
 4483   match(Set dst (ReplicateD con));
 4484   format %{ "replicateD $dst,$con" %}
 4485   ins_encode %{
 4486     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, Matcher::vector_length(this)));
 4487     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
 4488   %}
 4489   ins_pipe( pipe_slow );
 4490 %}
 4491 
 4492 instruct ReplD_zero(vec dst, immD0 zero) %{
 4493   match(Set dst (ReplicateD zero));
 4494   format %{ "replicateD $dst,$zero" %}
 4495   ins_encode %{
 4496     uint vlen = Matcher::vector_length(this);
 4497     if (vlen == 2) {
 4498       __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
 4499     } else {
 4500       int vlen_enc = vector_length_encoding(this);
 4501       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
 4502     }
 4503   %}
 4504   ins_pipe( fpu_reg_reg );
 4505 %}
 4506 
 4507 // ====================VECTOR INSERT=======================================
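// VectorInsert yields a copy of the source vector with a single lane
// replaced: dst = src; dst[idx] = val, where idx is a compile-time constant
// (immU8) asserted to be within the vector length.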
 4508 
 4509 instruct insert(vec dst, rRegI val, immU8 idx) %{
 4510   predicate(Matcher::vector_length_in_bytes(n) < 32);
 4511   match(Set dst (VectorInsert (Binary dst val) idx));
 4512   format %{ "vector_insert $dst,$val,$idx" %}
 4513   ins_encode %{
 4514     assert(UseSSE >= 4, "required");
 4515     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
 4516 
 4517     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4518 
 4519     assert(is_integral_type(elem_bt), "");
 4520     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4521 
 4522     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
 4523   %}
 4524   ins_pipe( pipe_slow );
 4525 %}
 4526 
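// Wider vectors are patched one 128-bit lane at a time: y_idx selects the
// lane, x_idx the element within it. E.g. for ints (4 per lane, so
// log2epr == 2) and idx == 5: x_idx == 1 and y_idx == 1, i.e. element 1 of
// the upper lane, which is extracted, patched with a scalar insert, and
// reinserted.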
 4527 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
 4528   predicate(Matcher::vector_length_in_bytes(n) == 32);
 4529   match(Set dst (VectorInsert (Binary src val) idx));
 4530   effect(TEMP vtmp);
 4531   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4532   ins_encode %{
 4534     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4535     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4536     int log2epr = log2(elem_per_lane);
 4537 
 4538     assert(is_integral_type(elem_bt), "sanity");
 4539     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4540 
 4541     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4542     uint y_idx = ($idx$$constant >> log2epr) & 1;
 4543     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4544     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4545     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4546   %}
 4547   ins_pipe( pipe_slow );
 4548 %}
 4549 
 4550 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
 4551   predicate(Matcher::vector_length_in_bytes(n) == 64);
 4552   match(Set dst (VectorInsert (Binary src val) idx));
 4553   effect(TEMP vtmp);
 4554   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4555   ins_encode %{
 4556     assert(UseAVX > 2, "sanity");
 4557 
 4558     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 4559     int elem_per_lane = 16/type2aelembytes(elem_bt);
 4560     int log2epr = log2(elem_per_lane);
 4561 
 4562     assert(is_integral_type(elem_bt), "");
 4563     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4564 
 4565     uint x_idx = $idx$$constant & right_n_bits(log2epr);
 4566     uint y_idx = ($idx$$constant >> log2epr) & 3;
 4567     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4568     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4569     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4570   %}
 4571   ins_pipe( pipe_slow );
 4572 %}
 4573 
 4574 #ifdef _LP64
 4575 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
 4576   predicate(Matcher::vector_length(n) == 2);
 4577   match(Set dst (VectorInsert (Binary dst val) idx));
 4578   format %{ "vector_insert $dst,$val,$idx" %}
 4579   ins_encode %{
 4580     assert(UseSSE >= 4, "required");
 4581     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4582     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4583 
 4584     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
 4585   %}
 4586   ins_pipe( pipe_slow );
 4587 %}
 4588 
 4589 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
 4590   predicate(Matcher::vector_length(n) == 4);
 4591   match(Set dst (VectorInsert (Binary src val) idx));
 4592   effect(TEMP vtmp);
 4593   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4594   ins_encode %{
 4595     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
 4596     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4597 
 4598     uint x_idx = $idx$$constant & right_n_bits(1);
 4599     uint y_idx = ($idx$$constant >> 1) & 1;
 4601     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4602     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4603     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4604   %}
 4605   ins_pipe( pipe_slow );
 4606 %}
 4607 
 4608 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
 4609   predicate(Matcher::vector_length(n) == 8);
 4610   match(Set dst (VectorInsert (Binary src val) idx));
 4611   effect(TEMP vtmp);
 4612   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4613   ins_encode %{
 4614     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
 4615     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4616 
 4617     uint x_idx = $idx$$constant & right_n_bits(1);
 4618     uint y_idx = ($idx$$constant >> 1) & 3;
 4619     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4620     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
 4621     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4622   %}
 4623   ins_pipe( pipe_slow );
 4624 %}
#endif // _LP64
 4626 
 4627 instruct insertF(vec dst, regF val, immU8 idx) %{
 4628   predicate(Matcher::vector_length(n) < 8);
 4629   match(Set dst (VectorInsert (Binary dst val) idx));
 4630   format %{ "vector_insert $dst,$val,$idx" %}
 4631   ins_encode %{
 4632     assert(UseSSE >= 4, "sanity");
 4633 
 4634     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4635     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4636 
 4637     uint x_idx = $idx$$constant & right_n_bits(2);
 4638     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4639   %}
 4640   ins_pipe( pipe_slow );
 4641 %}
 4642 
 4643 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
 4644   predicate(Matcher::vector_length(n) >= 8);
 4645   match(Set dst (VectorInsert (Binary src val) idx));
 4646   effect(TEMP vtmp);
 4647   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
 4648   ins_encode %{
 4649     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
 4650     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4651 
 4652     int vlen = Matcher::vector_length(this);
 4653     uint x_idx = $idx$$constant & right_n_bits(2);
 4654     if (vlen == 8) {
 4655       uint y_idx = ($idx$$constant >> 2) & 1;
 4657       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4658       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4659       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4660     } else {
 4661       assert(vlen == 16, "sanity");
 4662       uint y_idx = ($idx$$constant >> 2) & 3;
 4663       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4664       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
 4665       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4666     }
 4667   %}
 4668   ins_pipe( pipe_slow );
 4669 %}
 4670 
 4671 #ifdef _LP64
 4672 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
 4673   predicate(Matcher::vector_length(n) == 2);
 4674   match(Set dst (VectorInsert (Binary dst val) idx));
 4675   effect(TEMP tmp);
 4676   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
 4677   ins_encode %{
 4678     assert(UseSSE >= 4, "sanity");
 4679     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4680     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4681 
 4682     __ movq($tmp$$Register, $val$$XMMRegister);
 4683     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
 4684   %}
 4685   ins_pipe( pipe_slow );
 4686 %}
 4687 
 4688 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
 4689   predicate(Matcher::vector_length(n) == 4);
 4690   match(Set dst (VectorInsert (Binary src val) idx));
 4691   effect(TEMP vtmp, TEMP tmp);
 4692   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4693   ins_encode %{
 4694     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4695     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4696 
 4697     uint x_idx = $idx$$constant & right_n_bits(1);
 4698     uint y_idx = ($idx$$constant >> 1) & 1;
 4700     __ movq($tmp$$Register, $val$$XMMRegister);
 4701     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4702     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4703     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4704   %}
 4705   ins_pipe( pipe_slow );
 4706 %}
 4707 
instruct insert8D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, legVec vtmp) %{
 4709   predicate(Matcher::vector_length(n) == 8);
 4710   match(Set dst (VectorInsert (Binary src val) idx));
 4711   effect(TEMP tmp, TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
 4713   ins_encode %{
 4714     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
 4715     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
 4716 
 4717     uint x_idx = $idx$$constant & right_n_bits(1);
 4718     uint y_idx = ($idx$$constant >> 1) & 3;
 4719     __ movq($tmp$$Register, $val$$XMMRegister);
 4720     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
 4721     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
 4722     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
 4723   %}
 4724   ins_pipe( pipe_slow );
 4725 %}
#endif // _LP64
 4727 
 4728 // ====================REDUCTION ARITHMETIC=======================================
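// A reduction folds every lane of the vector input with the node's operator,
// then combines the result with the scalar input; e.g. AddReductionVI computes
//   dst = src1 + src2[0] + src2[1] + ... + src2[vlen-1]
// The reduce* macro-assembler helpers emit a shuffle-and-combine ladder that
// halves the vector until a single lane remains.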
 4729 
 4730 // =======================Int Reduction==========================================
 4731 
 4732 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4733   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
 4734   match(Set dst (AddReductionVI src1 src2));
 4735   match(Set dst (MulReductionVI src1 src2));
 4736   match(Set dst (AndReductionV  src1 src2));
 4737   match(Set dst ( OrReductionV  src1 src2));
 4738   match(Set dst (XorReductionV  src1 src2));
 4739   match(Set dst (MinReductionV  src1 src2));
 4740   match(Set dst (MaxReductionV  src1 src2));
 4741   effect(TEMP vtmp1, TEMP vtmp2);
 4742   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4743   ins_encode %{
 4744     int opcode = this->ideal_Opcode();
 4745     int vlen = Matcher::vector_length(this, $src2);
 4746     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4747   %}
 4748   ins_pipe( pipe_slow );
 4749 %}
 4750 
 4751 // =======================Long Reduction==========================================
 4752 
 4753 #ifdef _LP64
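// The two long-reduction variants below have identical bodies and differ only
// in register class: with AVX512DQ the full EVEX register file is usable
// (vec), otherwise the pattern is restricted to the 16 legacy-encodable XMM
// registers (legVec).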
 4754 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4755   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
 4756   match(Set dst (AddReductionVL src1 src2));
 4757   match(Set dst (MulReductionVL src1 src2));
 4758   match(Set dst (AndReductionV  src1 src2));
 4759   match(Set dst ( OrReductionV  src1 src2));
 4760   match(Set dst (XorReductionV  src1 src2));
 4761   match(Set dst (MinReductionV  src1 src2));
 4762   match(Set dst (MaxReductionV  src1 src2));
 4763   effect(TEMP vtmp1, TEMP vtmp2);
 4764   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4765   ins_encode %{
 4766     int opcode = this->ideal_Opcode();
 4767     int vlen = Matcher::vector_length(this, $src2);
 4768     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4769   %}
 4770   ins_pipe( pipe_slow );
 4771 %}
 4772 
 4773 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
 4774   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
 4775   match(Set dst (AddReductionVL src1 src2));
 4776   match(Set dst (MulReductionVL src1 src2));
 4777   match(Set dst (AndReductionV  src1 src2));
 4778   match(Set dst ( OrReductionV  src1 src2));
 4779   match(Set dst (XorReductionV  src1 src2));
 4780   match(Set dst (MinReductionV  src1 src2));
 4781   match(Set dst (MaxReductionV  src1 src2));
 4782   effect(TEMP vtmp1, TEMP vtmp2);
 4783   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4784   ins_encode %{
 4785     int opcode = this->ideal_Opcode();
 4786     int vlen = Matcher::vector_length(this, $src2);
 4787     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4788   %}
 4789   ins_pipe( pipe_slow );
 4790 %}
 4791 #endif // _LP64
 4792 
 4793 // =======================Float Reduction==========================================
 4794 
 4795 instruct reductionF128(regF dst, vec src, vec vtmp) %{
 4796   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
 4797   match(Set dst (AddReductionVF dst src));
 4798   match(Set dst (MulReductionVF dst src));
 4799   effect(TEMP dst, TEMP vtmp);
 4800   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
 4801   ins_encode %{
 4802     int opcode = this->ideal_Opcode();
 4803     int vlen = Matcher::vector_length(this, $src);
 4804     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
 4805   %}
 4806   ins_pipe( pipe_slow );
 4807 %}
 4808 
 4809 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
 4810   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 4811   match(Set dst (AddReductionVF dst src));
 4812   match(Set dst (MulReductionVF dst src));
 4813   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4814   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4815   ins_encode %{
 4816     int opcode = this->ideal_Opcode();
 4817     int vlen = Matcher::vector_length(this, $src);
 4818     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4819   %}
 4820   ins_pipe( pipe_slow );
 4821 %}
 4822 
 4823 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4824   predicate(Matcher::vector_length(n->in(2)) == 16); // src
 4825   match(Set dst (AddReductionVF dst src));
 4826   match(Set dst (MulReductionVF dst src));
 4827   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4828   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4829   ins_encode %{
 4830     int opcode = this->ideal_Opcode();
 4831     int vlen = Matcher::vector_length(this, $src);
 4832     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4833   %}
 4834   ins_pipe( pipe_slow );
 4835 %}
 4836 
 4837 // =======================Double Reduction==========================================
 4838 
 4839 instruct reduction2D(regD dst, vec src, vec vtmp) %{
 4840   predicate(Matcher::vector_length(n->in(2)) == 2); // src
 4841   match(Set dst (AddReductionVD dst src));
 4842   match(Set dst (MulReductionVD dst src));
 4843   effect(TEMP dst, TEMP vtmp);
 4844   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
 4845   ins_encode %{
 4846     int opcode = this->ideal_Opcode();
 4847     int vlen = Matcher::vector_length(this, $src);
 4848     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
  %}
 4850   ins_pipe( pipe_slow );
 4851 %}
 4852 
 4853 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
 4854   predicate(Matcher::vector_length(n->in(2)) == 4); // src
 4855   match(Set dst (AddReductionVD dst src));
 4856   match(Set dst (MulReductionVD dst src));
 4857   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4858   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4859   ins_encode %{
 4860     int opcode = this->ideal_Opcode();
 4861     int vlen = Matcher::vector_length(this, $src);
 4862     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4863   %}
 4864   ins_pipe( pipe_slow );
 4865 %}
 4866 
 4867 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
 4868   predicate(Matcher::vector_length(n->in(2)) == 8); // src
 4869   match(Set dst (AddReductionVD dst src));
 4870   match(Set dst (MulReductionVD dst src));
 4871   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4872   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
 4873   ins_encode %{
 4874     int opcode = this->ideal_Opcode();
 4875     int vlen = Matcher::vector_length(this, $src);
 4876     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4877   %}
 4878   ins_pipe( pipe_slow );
 4879 %}
 4880 
 4881 // =======================Byte Reduction==========================================
 4882 
 4883 #ifdef _LP64
 4884 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4885   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
 4886   match(Set dst (AddReductionVI src1 src2));
 4887   match(Set dst (AndReductionV  src1 src2));
 4888   match(Set dst ( OrReductionV  src1 src2));
 4889   match(Set dst (XorReductionV  src1 src2));
 4890   match(Set dst (MinReductionV  src1 src2));
 4891   match(Set dst (MaxReductionV  src1 src2));
 4892   effect(TEMP vtmp1, TEMP vtmp2);
 4893   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4894   ins_encode %{
 4895     int opcode = this->ideal_Opcode();
 4896     int vlen = Matcher::vector_length(this, $src2);
 4897     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4898   %}
 4899   ins_pipe( pipe_slow );
 4900 %}
 4901 
 4902 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 4903   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
 4904   match(Set dst (AddReductionVI src1 src2));
 4905   match(Set dst (AndReductionV  src1 src2));
 4906   match(Set dst ( OrReductionV  src1 src2));
 4907   match(Set dst (XorReductionV  src1 src2));
 4908   match(Set dst (MinReductionV  src1 src2));
 4909   match(Set dst (MaxReductionV  src1 src2));
 4910   effect(TEMP vtmp1, TEMP vtmp2);
 4911   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4912   ins_encode %{
 4913     int opcode = this->ideal_Opcode();
 4914     int vlen = Matcher::vector_length(this, $src2);
 4915     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4916   %}
 4917   ins_pipe( pipe_slow );
 4918 %}
#endif // _LP64
 4920 
 4921 // =======================Short Reduction==========================================
 4922 
 4923 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4924   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
 4925   match(Set dst (AddReductionVI src1 src2));
 4926   match(Set dst (MulReductionVI src1 src2));
 4927   match(Set dst (AndReductionV  src1 src2));
 4928   match(Set dst ( OrReductionV  src1 src2));
 4929   match(Set dst (XorReductionV  src1 src2));
 4930   match(Set dst (MinReductionV  src1 src2));
 4931   match(Set dst (MaxReductionV  src1 src2));
 4932   effect(TEMP vtmp1, TEMP vtmp2);
 4933   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
 4934   ins_encode %{
 4935     int opcode = this->ideal_Opcode();
 4936     int vlen = Matcher::vector_length(this, $src2);
 4937     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4938   %}
 4939   ins_pipe( pipe_slow );
 4940 %}
 4941 
 4942 // =======================Mul Reduction==========================================
 4943 
 4944 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
 4945   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 4946             Matcher::vector_length(n->in(2)) <= 32); // src2
 4947   match(Set dst (MulReductionVI src1 src2));
 4948   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4949   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 4950   ins_encode %{
 4951     int opcode = this->ideal_Opcode();
 4952     int vlen = Matcher::vector_length(this, $src2);
 4953     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4954   %}
 4955   ins_pipe( pipe_slow );
 4956 %}
 4957 
 4958 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
 4959   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
 4960             Matcher::vector_length(n->in(2)) == 64); // src2
 4961   match(Set dst (MulReductionVI src1 src2));
 4962   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
 4963   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
 4964   ins_encode %{
 4965     int opcode = this->ideal_Opcode();
 4966     int vlen = Matcher::vector_length(this, $src2);
 4967     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 4968   %}
 4969   ins_pipe( pipe_slow );
 4970 %}
 4971 
 4972 //--------------------Min/Max Float Reduction --------------------
// Float Min/Max Reduction
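// These patterns apply only when the scalar input is the operation's identity
// (+Infinity for min, -Infinity for max, enforced by the predicate), so src1
// can be ignored and the reduction computed from the vector alone. The _av
// variants further down handle the accumulating form, where dst is both an
// input and the result.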
 4974 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
 4975                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 4976   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 4977             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 4978              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 4979             Matcher::vector_length(n->in(2)) == 2);
 4980   match(Set dst (MinReductionV src1 src2));
 4981   match(Set dst (MaxReductionV src1 src2));
 4982   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 4983   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 4984   ins_encode %{
 4985     assert(UseAVX > 0, "sanity");
 4986 
 4987     int opcode = this->ideal_Opcode();
 4988     int vlen = Matcher::vector_length(this, $src2);
 4989     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 4990                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 4991   %}
 4992   ins_pipe( pipe_slow );
 4993 %}
 4994 
 4995 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
 4996                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 4997   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 4998             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
 4999              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
 5000             Matcher::vector_length(n->in(2)) >= 4);
 5001   match(Set dst (MinReductionV src1 src2));
 5002   match(Set dst (MaxReductionV src1 src2));
 5003   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5004   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5005   ins_encode %{
 5006     assert(UseAVX > 0, "sanity");
 5007 
 5008     int opcode = this->ideal_Opcode();
 5009     int vlen = Matcher::vector_length(this, $src2);
 5010     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
 5011                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5012   %}
 5013   ins_pipe( pipe_slow );
 5014 %}
 5015 
 5016 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
 5017                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
 5018   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5019             Matcher::vector_length(n->in(2)) == 2);
 5020   match(Set dst (MinReductionV dst src));
 5021   match(Set dst (MaxReductionV dst src));
 5022   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
 5023   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
 5024   ins_encode %{
 5025     assert(UseAVX > 0, "sanity");
 5026 
 5027     int opcode = this->ideal_Opcode();
 5028     int vlen = Matcher::vector_length(this, $src);
 5029     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5030                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
 5031   %}
 5032   ins_pipe( pipe_slow );
 5033 %}
 5034 
 5035 
 5036 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
 5037                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
 5038   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
 5039             Matcher::vector_length(n->in(2)) >= 4);
 5040   match(Set dst (MinReductionV dst src));
 5041   match(Set dst (MaxReductionV dst src));
 5042   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
 5043   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
 5044   ins_encode %{
 5045     assert(UseAVX > 0, "sanity");
 5046 
 5047     int opcode = this->ideal_Opcode();
 5048     int vlen = Matcher::vector_length(this, $src);
 5049     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
 5050                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
 5051   %}
 5052   ins_pipe( pipe_slow );
 5053 %}
 5054 
 5055 
//--------------------Min/Max Double Reduction --------------------
 5057 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
 5058                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5059                             rFlagsReg cr) %{
 5060   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5061             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5062              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5063             Matcher::vector_length(n->in(2)) == 2);
 5064   match(Set dst (MinReductionV src1 src2));
 5065   match(Set dst (MaxReductionV src1 src2));
 5066   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5067   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5068   ins_encode %{
 5069     assert(UseAVX > 0, "sanity");
 5070 
 5071     int opcode = this->ideal_Opcode();
 5072     int vlen = Matcher::vector_length(this, $src2);
 5073     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5074                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5075   %}
 5076   ins_pipe( pipe_slow );
 5077 %}
 5078 
 5079 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
 5080                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5081                            rFlagsReg cr) %{
 5082   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5083             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
 5084              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
 5085             Matcher::vector_length(n->in(2)) >= 4);
 5086   match(Set dst (MinReductionV src1 src2));
 5087   match(Set dst (MaxReductionV src1 src2));
 5088   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5089   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5090   ins_encode %{
 5091     assert(UseAVX > 0, "sanity");
 5092 
 5093     int opcode = this->ideal_Opcode();
 5094     int vlen = Matcher::vector_length(this, $src2);
 5095     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
 5096                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5097   %}
 5098   ins_pipe( pipe_slow );
 5099 %}
 5100 
 5101 
 5102 instruct minmax_reduction2D_av(legRegD dst, legVec src,
 5103                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
 5104                                rFlagsReg cr) %{
 5105   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5106             Matcher::vector_length(n->in(2)) == 2);
 5107   match(Set dst (MinReductionV dst src));
 5108   match(Set dst (MaxReductionV dst src));
 5109   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
 5110   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
 5111   ins_encode %{
 5112     assert(UseAVX > 0, "sanity");
 5113 
 5114     int opcode = this->ideal_Opcode();
 5115     int vlen = Matcher::vector_length(this, $src);
 5116     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5117                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
 5118   %}
 5119   ins_pipe( pipe_slow );
 5120 %}
 5121 
 5122 instruct minmax_reductionD_av(legRegD dst, legVec src,
 5123                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
 5124                               rFlagsReg cr) %{
 5125   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
 5126             Matcher::vector_length(n->in(2)) >= 4);
 5127   match(Set dst (MinReductionV dst src));
 5128   match(Set dst (MaxReductionV dst src));
 5129   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
 5130   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
 5131   ins_encode %{
 5132     assert(UseAVX > 0, "sanity");
 5133 
 5134     int opcode = this->ideal_Opcode();
 5135     int vlen = Matcher::vector_length(this, $src);
 5136     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
 5137                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
 5138   %}
 5139   ins_pipe( pipe_slow );
 5140 %}
 5141 
 5142 // ====================VECTOR ARITHMETIC=======================================
 5143 
 5144 // --------------------------------- ADD --------------------------------------
 5145 
 5146 // Bytes vector add
 5147 instruct vaddB(vec dst, vec src) %{
 5148   predicate(UseAVX == 0);
 5149   match(Set dst (AddVB dst src));
 5150   format %{ "paddb   $dst,$src\t! add packedB" %}
 5151   ins_encode %{
 5152     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
 5153   %}
 5154   ins_pipe( pipe_slow );
 5155 %}
 5156 
 5157 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
 5158   predicate(UseAVX > 0);
 5159   match(Set dst (AddVB src1 src2));
 5160   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
 5161   ins_encode %{
 5162     int vlen_enc = vector_length_encoding(this);
 5163     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5164   %}
 5165   ins_pipe( pipe_slow );
 5166 %}
 5167 
 5168 instruct vaddB_mem(vec dst, vec src, memory mem) %{
 5169   predicate((UseAVX > 0) &&
 5170             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5171   match(Set dst (AddVB src (LoadVector mem)));
 5172   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
 5173   ins_encode %{
 5174     int vlen_enc = vector_length_encoding(this);
 5175     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5176   %}
 5177   ins_pipe( pipe_slow );
 5178 %}
 5179 
 5180 // Shorts/Chars vector add
 5181 instruct vaddS(vec dst, vec src) %{
 5182   predicate(UseAVX == 0);
 5183   match(Set dst (AddVS dst src));
 5184   format %{ "paddw   $dst,$src\t! add packedS" %}
 5185   ins_encode %{
 5186     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
 5187   %}
 5188   ins_pipe( pipe_slow );
 5189 %}
 5190 
 5191 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
 5192   predicate(UseAVX > 0);
 5193   match(Set dst (AddVS src1 src2));
 5194   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
 5195   ins_encode %{
 5196     int vlen_enc = vector_length_encoding(this);
 5197     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5198   %}
 5199   ins_pipe( pipe_slow );
 5200 %}
 5201 
 5202 instruct vaddS_mem(vec dst, vec src, memory mem) %{
 5203   predicate((UseAVX > 0) &&
 5204             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5205   match(Set dst (AddVS src (LoadVector mem)));
 5206   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
 5207   ins_encode %{
 5208     int vlen_enc = vector_length_encoding(this);
 5209     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5210   %}
 5211   ins_pipe( pipe_slow );
 5212 %}
 5213 
 5214 // Integers vector add
 5215 instruct vaddI(vec dst, vec src) %{
 5216   predicate(UseAVX == 0);
 5217   match(Set dst (AddVI dst src));
 5218   format %{ "paddd   $dst,$src\t! add packedI" %}
 5219   ins_encode %{
 5220     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
 5221   %}
 5222   ins_pipe( pipe_slow );
 5223 %}
 5224 
 5225 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
 5226   predicate(UseAVX > 0);
 5227   match(Set dst (AddVI src1 src2));
 5228   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
 5229   ins_encode %{
 5230     int vlen_enc = vector_length_encoding(this);
 5231     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5232   %}
 5233   ins_pipe( pipe_slow );
 5234 %}
 5235 
 5236 
 5237 instruct vaddI_mem(vec dst, vec src, memory mem) %{
 5238   predicate((UseAVX > 0) &&
 5239             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5240   match(Set dst (AddVI src (LoadVector mem)));
 5241   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
 5242   ins_encode %{
 5243     int vlen_enc = vector_length_encoding(this);
 5244     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5245   %}
 5246   ins_pipe( pipe_slow );
 5247 %}
 5248 
 5249 // Longs vector add
 5250 instruct vaddL(vec dst, vec src) %{
 5251   predicate(UseAVX == 0);
 5252   match(Set dst (AddVL dst src));
 5253   format %{ "paddq   $dst,$src\t! add packedL" %}
 5254   ins_encode %{
 5255     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
 5256   %}
 5257   ins_pipe( pipe_slow );
 5258 %}
 5259 
 5260 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
 5261   predicate(UseAVX > 0);
 5262   match(Set dst (AddVL src1 src2));
 5263   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
 5264   ins_encode %{
 5265     int vlen_enc = vector_length_encoding(this);
 5266     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5267   %}
 5268   ins_pipe( pipe_slow );
 5269 %}
 5270 
 5271 instruct vaddL_mem(vec dst, vec src, memory mem) %{
 5272   predicate((UseAVX > 0) &&
 5273             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5274   match(Set dst (AddVL src (LoadVector mem)));
 5275   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
 5276   ins_encode %{
 5277     int vlen_enc = vector_length_encoding(this);
 5278     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5279   %}
 5280   ins_pipe( pipe_slow );
 5281 %}
 5282 
 5283 // Floats vector add
 5284 instruct vaddF(vec dst, vec src) %{
 5285   predicate(UseAVX == 0);
 5286   match(Set dst (AddVF dst src));
 5287   format %{ "addps   $dst,$src\t! add packedF" %}
 5288   ins_encode %{
 5289     __ addps($dst$$XMMRegister, $src$$XMMRegister);
 5290   %}
 5291   ins_pipe( pipe_slow );
 5292 %}
 5293 
 5294 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
 5295   predicate(UseAVX > 0);
 5296   match(Set dst (AddVF src1 src2));
 5297   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
 5298   ins_encode %{
 5299     int vlen_enc = vector_length_encoding(this);
 5300     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5301   %}
 5302   ins_pipe( pipe_slow );
 5303 %}
 5304 
 5305 instruct vaddF_mem(vec dst, vec src, memory mem) %{
 5306   predicate((UseAVX > 0) &&
 5307             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5308   match(Set dst (AddVF src (LoadVector mem)));
 5309   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
 5310   ins_encode %{
 5311     int vlen_enc = vector_length_encoding(this);
 5312     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5313   %}
 5314   ins_pipe( pipe_slow );
 5315 %}
 5316 
 5317 // Doubles vector add
 5318 instruct vaddD(vec dst, vec src) %{
 5319   predicate(UseAVX == 0);
 5320   match(Set dst (AddVD dst src));
 5321   format %{ "addpd   $dst,$src\t! add packedD" %}
 5322   ins_encode %{
 5323     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
 5324   %}
 5325   ins_pipe( pipe_slow );
 5326 %}
 5327 
 5328 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
 5329   predicate(UseAVX > 0);
 5330   match(Set dst (AddVD src1 src2));
 5331   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
 5332   ins_encode %{
 5333     int vlen_enc = vector_length_encoding(this);
 5334     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5335   %}
 5336   ins_pipe( pipe_slow );
 5337 %}
 5338 
 5339 instruct vaddD_mem(vec dst, vec src, memory mem) %{
 5340   predicate((UseAVX > 0) &&
 5341             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5342   match(Set dst (AddVD src (LoadVector mem)));
 5343   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
 5344   ins_encode %{
 5345     int vlen_enc = vector_length_encoding(this);
 5346     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5347   %}
 5348   ins_pipe( pipe_slow );
 5349 %}
 5350 
 5351 // --------------------------------- SUB --------------------------------------
 5352 
 5353 // Bytes vector sub
 5354 instruct vsubB(vec dst, vec src) %{
 5355   predicate(UseAVX == 0);
 5356   match(Set dst (SubVB dst src));
 5357   format %{ "psubb   $dst,$src\t! sub packedB" %}
 5358   ins_encode %{
 5359     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
 5360   %}
 5361   ins_pipe( pipe_slow );
 5362 %}
 5363 
 5364 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
 5365   predicate(UseAVX > 0);
 5366   match(Set dst (SubVB src1 src2));
 5367   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
 5368   ins_encode %{
 5369     int vlen_enc = vector_length_encoding(this);
 5370     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5371   %}
 5372   ins_pipe( pipe_slow );
 5373 %}
 5374 
 5375 instruct vsubB_mem(vec dst, vec src, memory mem) %{
 5376   predicate((UseAVX > 0) &&
 5377             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5378   match(Set dst (SubVB src (LoadVector mem)));
 5379   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
 5380   ins_encode %{
 5381     int vlen_enc = vector_length_encoding(this);
 5382     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5383   %}
 5384   ins_pipe( pipe_slow );
 5385 %}
 5386 
 5387 // Shorts/Chars vector sub
 5388 instruct vsubS(vec dst, vec src) %{
 5389   predicate(UseAVX == 0);
 5390   match(Set dst (SubVS dst src));
 5391   format %{ "psubw   $dst,$src\t! sub packedS" %}
 5392   ins_encode %{
 5393     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
 5394   %}
 5395   ins_pipe( pipe_slow );
 5396 %}
 5397 
 5398 
 5399 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
 5400   predicate(UseAVX > 0);
 5401   match(Set dst (SubVS src1 src2));
 5402   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
 5403   ins_encode %{
 5404     int vlen_enc = vector_length_encoding(this);
 5405     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5406   %}
 5407   ins_pipe( pipe_slow );
 5408 %}
 5409 
 5410 instruct vsubS_mem(vec dst, vec src, memory mem) %{
 5411   predicate((UseAVX > 0) &&
 5412             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5413   match(Set dst (SubVS src (LoadVector mem)));
 5414   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
 5415   ins_encode %{
 5416     int vlen_enc = vector_length_encoding(this);
 5417     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5418   %}
 5419   ins_pipe( pipe_slow );
 5420 %}
 5421 
 5422 // Integers vector sub
 5423 instruct vsubI(vec dst, vec src) %{
 5424   predicate(UseAVX == 0);
 5425   match(Set dst (SubVI dst src));
 5426   format %{ "psubd   $dst,$src\t! sub packedI" %}
 5427   ins_encode %{
 5428     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
 5429   %}
 5430   ins_pipe( pipe_slow );
 5431 %}
 5432 
 5433 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
 5434   predicate(UseAVX > 0);
 5435   match(Set dst (SubVI src1 src2));
 5436   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
 5437   ins_encode %{
 5438     int vlen_enc = vector_length_encoding(this);
 5439     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5440   %}
 5441   ins_pipe( pipe_slow );
 5442 %}
 5443 
 5444 instruct vsubI_mem(vec dst, vec src, memory mem) %{
 5445   predicate((UseAVX > 0) &&
 5446             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5447   match(Set dst (SubVI src (LoadVector mem)));
 5448   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
 5449   ins_encode %{
 5450     int vlen_enc = vector_length_encoding(this);
 5451     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5452   %}
 5453   ins_pipe( pipe_slow );
 5454 %}
 5455 
 5456 // Longs vector sub
 5457 instruct vsubL(vec dst, vec src) %{
 5458   predicate(UseAVX == 0);
 5459   match(Set dst (SubVL dst src));
 5460   format %{ "psubq   $dst,$src\t! sub packedL" %}
 5461   ins_encode %{
 5462     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
 5463   %}
 5464   ins_pipe( pipe_slow );
 5465 %}
 5466 
 5467 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
 5468   predicate(UseAVX > 0);
 5469   match(Set dst (SubVL src1 src2));
 5470   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
 5471   ins_encode %{
 5472     int vlen_enc = vector_length_encoding(this);
 5473     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5474   %}
 5475   ins_pipe( pipe_slow );
 5476 %}
 5477 
 5478 
 5479 instruct vsubL_mem(vec dst, vec src, memory mem) %{
 5480   predicate((UseAVX > 0) &&
 5481             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5482   match(Set dst (SubVL src (LoadVector mem)));
 5483   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
 5484   ins_encode %{
 5485     int vlen_enc = vector_length_encoding(this);
 5486     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5487   %}
 5488   ins_pipe( pipe_slow );
 5489 %}
 5490 
 5491 // Floats vector sub
 5492 instruct vsubF(vec dst, vec src) %{
 5493   predicate(UseAVX == 0);
 5494   match(Set dst (SubVF dst src));
 5495   format %{ "subps   $dst,$src\t! sub packedF" %}
 5496   ins_encode %{
 5497     __ subps($dst$$XMMRegister, $src$$XMMRegister);
 5498   %}
 5499   ins_pipe( pipe_slow );
 5500 %}
 5501 
 5502 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
 5503   predicate(UseAVX > 0);
 5504   match(Set dst (SubVF src1 src2));
 5505   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
 5506   ins_encode %{
 5507     int vlen_enc = vector_length_encoding(this);
 5508     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5509   %}
 5510   ins_pipe( pipe_slow );
 5511 %}
 5512 
 5513 instruct vsubF_mem(vec dst, vec src, memory mem) %{
 5514   predicate((UseAVX > 0) &&
 5515             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5516   match(Set dst (SubVF src (LoadVector mem)));
 5517   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
 5518   ins_encode %{
 5519     int vlen_enc = vector_length_encoding(this);
 5520     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5521   %}
 5522   ins_pipe( pipe_slow );
 5523 %}
 5524 
 5525 // Doubles vector sub
 5526 instruct vsubD(vec dst, vec src) %{
 5527   predicate(UseAVX == 0);
 5528   match(Set dst (SubVD dst src));
 5529   format %{ "subpd   $dst,$src\t! sub packedD" %}
 5530   ins_encode %{
 5531     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
 5532   %}
 5533   ins_pipe( pipe_slow );
 5534 %}
 5535 
 5536 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
 5537   predicate(UseAVX > 0);
 5538   match(Set dst (SubVD src1 src2));
 5539   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
 5540   ins_encode %{
 5541     int vlen_enc = vector_length_encoding(this);
 5542     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5543   %}
 5544   ins_pipe( pipe_slow );
 5545 %}
 5546 
 5547 instruct vsubD_mem(vec dst, vec src, memory mem) %{
 5548   predicate((UseAVX > 0) &&
 5549             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5550   match(Set dst (SubVD src (LoadVector mem)));
 5551   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
 5552   ins_encode %{
 5553     int vlen_enc = vector_length_encoding(this);
 5554     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5555   %}
 5556   ins_pipe( pipe_slow );
 5557 %}
 5558 
 5559 // --------------------------------- MUL --------------------------------------
 5560 
 5561 // Byte vector mul
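// x86 has no packed byte multiply, so the rules below sign-extend bytes to
// words (pmovsxbw), multiply with pmullw, mask each word back to its low
// byte, and re-pack with packuswb.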
 5562 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
 5563   predicate(Matcher::vector_length(n) == 4 ||
 5564             Matcher::vector_length(n) == 8);
 5565   match(Set dst (MulVB src1 src2));
 5566   effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{ "vector_mulB $dst,$src1,$src2" %}
 5568   ins_encode %{
 5569     assert(UseSSE > 3, "required");
 5570     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
 5571     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
 5572     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
 5573     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
 5574     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 5575     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 5576   %}
 5577   ins_pipe( pipe_slow );
 5578 %}
 5579 
 5580 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
 5581   predicate(Matcher::vector_length(n) == 16 && UseAVX <= 1);
 5582   match(Set dst (MulVB src1 src2));
 5583   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
  format %{ "vector_mulB $dst,$src1,$src2" %}
 5585   ins_encode %{
 5586     assert(UseSSE > 3, "required");
 5587     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
 5588     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
 5589     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
 5590     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
 5591     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
 5592     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
 5593     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
 5594     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
 5595     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
 5596     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 5597     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 5598     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 5599   %}
 5600   ins_pipe( pipe_slow );
 5601 %}
 5602 
 5603 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
 5604   predicate(Matcher::vector_length(n) == 16 && UseAVX > 1);
 5605   match(Set dst (MulVB src1 src2));
 5606   effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{ "vector_mulB $dst,$src1,$src2" %}
 5608   ins_encode %{
    int vlen_enc = Assembler::AVX_256bit;
 5610     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 5611     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5612     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5613     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
 5614     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 5615     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
 5616     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
 5617   %}
 5618   ins_pipe( pipe_slow );
 5619 %}
 5620 
 5621 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
 5622   predicate(Matcher::vector_length(n) == 32);
 5623   match(Set dst (MulVB src1 src2));
 5624   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
  format %{ "vector_mulB $dst,$src1,$src2" %}
 5626   ins_encode %{
 5627     assert(UseAVX > 1, "required");
 5628     int vlen_enc = Assembler::AVX_256bit;
 5629     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
 5630     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
 5631     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 5632     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5633     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5634     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 5635     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5636     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5637     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
 5638     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5639     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5640     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 5641     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 5642     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 5643   %}
 5644   ins_pipe( pipe_slow );
 5645 %}
 5646 
 5647 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
 5648   predicate(Matcher::vector_length(n) == 64);
 5649   match(Set dst (MulVB src1 src2));
 5650   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
  format %{ "vector_mulB $dst,$src1,$src2" %}
 5652   ins_encode %{
 5653     assert(UseAVX > 2, "required");
 5654     int vlen_enc = Assembler::AVX_512bit;
 5655     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
 5656     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
 5657     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 5658     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5659     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5660     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 5661     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5662     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5663     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
 5664     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5665     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5666     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5667     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 5668     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
 5669     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5670   %}
 5671   ins_pipe( pipe_slow );
 5672 %}
 5673 
 5674 // Shorts/Chars vector mul
 5675 instruct vmulS(vec dst, vec src) %{
 5676   predicate(UseAVX == 0);
 5677   match(Set dst (MulVS dst src));
 5678   format %{ "pmullw $dst,$src\t! mul packedS" %}
 5679   ins_encode %{
 5680     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
 5681   %}
 5682   ins_pipe( pipe_slow );
 5683 %}
 5684 
 5685 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
 5686   predicate(UseAVX > 0);
 5687   match(Set dst (MulVS src1 src2));
 5688   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
 5689   ins_encode %{
 5690     int vlen_enc = vector_length_encoding(this);
 5691     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5692   %}
 5693   ins_pipe( pipe_slow );
 5694 %}
 5695 
 5696 instruct vmulS_mem(vec dst, vec src, memory mem) %{
 5697   predicate((UseAVX > 0) &&
 5698             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5699   match(Set dst (MulVS src (LoadVector mem)));
 5700   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
 5701   ins_encode %{
 5702     int vlen_enc = vector_length_encoding(this);
 5703     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5704   %}
 5705   ins_pipe( pipe_slow );
 5706 %}
 5707 
 5708 // Integers vector mul
 5709 instruct vmulI(vec dst, vec src) %{
 5710   predicate(UseAVX == 0);
 5711   match(Set dst (MulVI dst src));
 5712   format %{ "pmulld  $dst,$src\t! mul packedI" %}
 5713   ins_encode %{
 5714     assert(UseSSE > 3, "required");
 5715     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
 5716   %}
 5717   ins_pipe( pipe_slow );
 5718 %}
 5719 
 5720 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
 5721   predicate(UseAVX > 0);
 5722   match(Set dst (MulVI src1 src2));
 5723   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
 5724   ins_encode %{
 5725     int vlen_enc = vector_length_encoding(this);
 5726     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5727   %}
 5728   ins_pipe( pipe_slow );
 5729 %}
 5730 
 5731 instruct vmulI_mem(vec dst, vec src, memory mem) %{
 5732   predicate((UseAVX > 0) &&
 5733             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5734   match(Set dst (MulVI src (LoadVector mem)));
 5735   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
 5736   ins_encode %{
 5737     int vlen_enc = vector_length_encoding(this);
 5738     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5739   %}
 5740   ins_pipe( pipe_slow );
 5741 %}
 5742 
 5743 // Longs vector mul
 5744 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
 5745   predicate(VM_Version::supports_avx512dq());
 5746   match(Set dst (MulVL src1 src2));
 5747   format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
 5748   ins_encode %{
 5749     assert(UseAVX > 2, "required");
 5750     int vlen_enc = vector_length_encoding(this);
 5751     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5752   %}
 5753   ins_pipe( pipe_slow );
 5754 %}
 5755 
 5756 instruct vmulL_mem(vec dst, vec src, memory mem) %{
 5757   predicate(VM_Version::supports_avx512dq() &&
 5758               (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5759   match(Set dst (MulVL src (LoadVector mem)));
 5760   format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
 5761   ins_encode %{
 5762     assert(UseAVX > 2, "required");
 5763     int vlen_enc = vector_length_encoding(this);
 5764     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5765   %}
 5766   ins_pipe( pipe_slow );
 5767 %}
 5768 
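// Without AVX512DQ there is no vpmullq, so the rules below compose a
// 64x64->64-bit multiply from 32-bit pieces. Writing each lane as
// x = xh*2^32 + xl and y = yh*2^32 + yl:
//   x*y mod 2^64 = ((xh*yl + xl*yh) << 32) + xl*yl
// pshufd/pmulld/phaddd form the cross-term sum and pmuludq the low product.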
 5769 instruct mul2L_reg(vec dst, vec src2, legVec tmp) %{
 5770   predicate(Matcher::vector_length(n) == 2 && !VM_Version::supports_avx512dq());
 5771   match(Set dst (MulVL dst src2));
 5772   effect(TEMP dst, TEMP tmp);
 5773   format %{ "pshufd $tmp,$src2, 177\n\t"
 5774             "pmulld $tmp,$dst\n\t"
 5775             "phaddd $tmp,$tmp\n\t"
 5776             "pmovzxdq $tmp,$tmp\n\t"
 5777             "psllq $tmp, 32\n\t"
 5778             "pmuludq $dst,$src2\n\t"
 5779             "paddq $dst,$tmp\n\t! mul packed2L" %}
 5780 
 5781   ins_encode %{
 5782     assert(VM_Version::supports_sse4_1(), "required");
 5783     int vlen_enc = Assembler::AVX_128bit;
 5784     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
 5785     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
 5786     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
 5787     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
 5788     __ psllq($tmp$$XMMRegister, 32);
 5789     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
 5790     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
 5791   %}
 5792   ins_pipe( pipe_slow );
 5793 %}
 5794 
 5795 instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, legVec tmp, legVec tmp1) %{
 5796   predicate(Matcher::vector_length(n) == 4 && !VM_Version::supports_avx512dq());
 5797   match(Set dst (MulVL src1 src2));
 5798   effect(TEMP tmp1, TEMP tmp);
  format %{ "vpshufd $tmp,$src2,177\n\t"
            "vpmulld $tmp,$src1,$tmp\n\t"
            "vextracti128_high $tmp1,$tmp\n\t"
            "vphaddd $tmp,$tmp,$tmp1\n\t"
            "vpmovzxdq $tmp,$tmp\n\t"
            "vpsllq $tmp,$tmp,32\n\t"
            "vpmuludq $tmp1,$src1,$src2\n\t"
            "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
 5806   ins_encode %{
 5807     int vlen_enc = Assembler::AVX_256bit;
 5808     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
 5809     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 5810     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
 5811     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 5812     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 5813     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
 5814     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5815     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 5816   %}
 5817   ins_pipe( pipe_slow );
 5818 %}
 5819 
 5820 // Floats vector mul
 5821 instruct vmulF(vec dst, vec src) %{
 5822   predicate(UseAVX == 0);
 5823   match(Set dst (MulVF dst src));
 5824   format %{ "mulps   $dst,$src\t! mul packedF" %}
 5825   ins_encode %{
 5826     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
 5827   %}
 5828   ins_pipe( pipe_slow );
 5829 %}
 5830 
 5831 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
 5832   predicate(UseAVX > 0);
 5833   match(Set dst (MulVF src1 src2));
 5834   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
 5835   ins_encode %{
 5836     int vlen_enc = vector_length_encoding(this);
 5837     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5838   %}
 5839   ins_pipe( pipe_slow );
 5840 %}
 5841 
 5842 instruct vmulF_mem(vec dst, vec src, memory mem) %{
 5843   predicate((UseAVX > 0) &&
 5844             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5845   match(Set dst (MulVF src (LoadVector mem)));
 5846   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
 5847   ins_encode %{
 5848     int vlen_enc = vector_length_encoding(this);
 5849     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5850   %}
 5851   ins_pipe( pipe_slow );
 5852 %}
 5853 
 5854 // Doubles vector mul
 5855 instruct vmulD(vec dst, vec src) %{
 5856   predicate(UseAVX == 0);
 5857   match(Set dst (MulVD dst src));
 5858   format %{ "mulpd   $dst,$src\t! mul packedD" %}
 5859   ins_encode %{
 5860     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
 5861   %}
 5862   ins_pipe( pipe_slow );
 5863 %}
 5864 
 5865 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
 5866   predicate(UseAVX > 0);
 5867   match(Set dst (MulVD src1 src2));
 5868   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
 5869   ins_encode %{
 5870     int vlen_enc = vector_length_encoding(this);
 5871     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5872   %}
 5873   ins_pipe( pipe_slow );
 5874 %}
 5875 
 5876 instruct vmulD_mem(vec dst, vec src, memory mem) %{
 5877   predicate((UseAVX > 0) &&
 5878             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5879   match(Set dst (MulVD src (LoadVector mem)));
 5880   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
 5881   ins_encode %{
 5882     int vlen_enc = vector_length_encoding(this);
 5883     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5884   %}
 5885   ins_pipe( pipe_slow );
 5886 %}
 5887 
 5888 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
 5889   predicate(Matcher::vector_length(n) == 8);
 5890   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
 5891   effect(TEMP dst, USE src1, USE src2);
  format %{ "vcmpps.$copnd $dst,$src1,$src2\t! vcmovevf, cond=$cop\n\t"
            "vblendvps $dst,$src1,$src2,$dst\t! vcmovevf" %}
 5895   ins_encode %{
 5896     assert(UseAVX > 0, "required");
 5897 
 5898     int vlen_enc = Assembler::AVX_256bit;
 5899     int cond = (Assembler::Condition)($copnd$$cmpcode);
 5900     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
 5901     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5902   %}
 5903   ins_pipe( pipe_slow );
 5904 %}
 5905 
 5906 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
 5907   predicate(Matcher::vector_length(n) == 4);
 5908   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
 5909   effect(TEMP dst, USE src1, USE src2);
  format %{ "vcmppd.$copnd $dst,$src1,$src2\t! vcmovevd, cond=$cop\n\t"
            "vblendvpd $dst,$src1,$src2,$dst\t! vcmovevd" %}
 5913   ins_encode %{
 5914     assert(UseAVX > 0, "required");
 5915 
 5916     int vlen_enc = Assembler::AVX_256bit;
 5917     int cond = (Assembler::Condition)($copnd$$cmpcode);
 5918     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
 5919     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 5920   %}
 5921   ins_pipe( pipe_slow );
 5922 %}
 5923 
 5924 // --------------------------------- DIV --------------------------------------
 5925 
 5926 // Floats vector div
 5927 instruct vdivF(vec dst, vec src) %{
 5928   predicate(UseAVX == 0);
 5929   match(Set dst (DivVF dst src));
 5930   format %{ "divps   $dst,$src\t! div packedF" %}
 5931   ins_encode %{
 5932     __ divps($dst$$XMMRegister, $src$$XMMRegister);
 5933   %}
 5934   ins_pipe( pipe_slow );
 5935 %}
 5936 
 5937 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
 5938   predicate(UseAVX > 0);
 5939   match(Set dst (DivVF src1 src2));
 5940   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
 5941   ins_encode %{
 5942     int vlen_enc = vector_length_encoding(this);
 5943     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5944   %}
 5945   ins_pipe( pipe_slow );
 5946 %}
 5947 
 5948 instruct vdivF_mem(vec dst, vec src, memory mem) %{
 5949   predicate((UseAVX > 0) &&
 5950             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5951   match(Set dst (DivVF src (LoadVector mem)));
 5952   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
 5953   ins_encode %{
 5954     int vlen_enc = vector_length_encoding(this);
 5955     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5956   %}
 5957   ins_pipe( pipe_slow );
 5958 %}
 5959 
 5960 // Doubles vector div
 5961 instruct vdivD(vec dst, vec src) %{
 5962   predicate(UseAVX == 0);
 5963   match(Set dst (DivVD dst src));
 5964   format %{ "divpd   $dst,$src\t! div packedD" %}
 5965   ins_encode %{
 5966     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
 5967   %}
 5968   ins_pipe( pipe_slow );
 5969 %}
 5970 
 5971 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
 5972   predicate(UseAVX > 0);
 5973   match(Set dst (DivVD src1 src2));
 5974   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
 5975   ins_encode %{
 5976     int vlen_enc = vector_length_encoding(this);
 5977     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 5978   %}
 5979   ins_pipe( pipe_slow );
 5980 %}
 5981 
 5982 instruct vdivD_mem(vec dst, vec src, memory mem) %{
 5983   predicate((UseAVX > 0) &&
 5984             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 5985   match(Set dst (DivVD src (LoadVector mem)));
 5986   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
 5987   ins_encode %{
 5988     int vlen_enc = vector_length_encoding(this);
 5989     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 5990   %}
 5991   ins_pipe( pipe_slow );
 5992 %}
 5993 
 5994 // ------------------------------ MinMax ---------------------------------------
 5995 
 5996 // Byte, Short, Int vector Min/Max
 5997 instruct minmax_reg_sse(vec dst, vec src) %{
 5998   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 5999             UseAVX == 0);
 6000   match(Set dst (MinV dst src));
 6001   match(Set dst (MaxV dst src));
 6002   format %{ "vector_minmax  $dst,$src\t!  " %}
 6003   ins_encode %{
 6004     assert(UseSSE >= 4, "required");
 6005 
 6006     int opcode = this->ideal_Opcode();
 6007     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6008     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
 6009   %}
 6010   ins_pipe( pipe_slow );
 6011 %}
 6012 
 6013 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
 6014   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
 6015             UseAVX > 0);
 6016   match(Set dst (MinV src1 src2));
 6017   match(Set dst (MaxV src1 src2));
 6018   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
 6019   ins_encode %{
 6020     int opcode = this->ideal_Opcode();
 6021     int vlen_enc = vector_length_encoding(this);
 6022     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6023 
 6024     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6025   %}
 6026   ins_pipe( pipe_slow );
 6027 %}
 6028 
 6029 // Long vector Min/Max
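// Packed 64-bit signed min/max (vpminsq/vpmaxsq) exists only as an EVEX
// encoding, so the pre-AVX512 flavors below emulate it with a compare
// followed by a blend. The SSE flavor pins its temp to xmm0 because the
// non-VEX blendvpd implicitly uses xmm0 as the selection mask.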
 6030 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
 6031   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6032             UseAVX == 0);
 6033   match(Set dst (MinV dst src));
 6034   match(Set dst (MaxV src dst));
 6035   effect(TEMP dst, TEMP tmp);
 6036   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
 6037   ins_encode %{
 6038     assert(UseSSE >= 4, "required");
 6039 
 6040     int opcode = this->ideal_Opcode();
 6041     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6042     assert(elem_bt == T_LONG, "sanity");
 6043 
 6044     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
 6045   %}
 6046   ins_pipe( pipe_slow );
 6047 %}
 6048 
 6049 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
 6050   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
 6051             UseAVX > 0 && !VM_Version::supports_avx512vl());
 6052   match(Set dst (MinV src1 src2));
 6053   match(Set dst (MaxV src1 src2));
 6054   effect(TEMP dst);
 6055   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6056   ins_encode %{
 6057     int vlen_enc = vector_length_encoding(this);
 6058     int opcode = this->ideal_Opcode();
 6059     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6060     assert(elem_bt == T_LONG, "sanity");
 6061 
 6062     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6063   %}
 6064   ins_pipe( pipe_slow );
 6065 %}
 6066 
 6067 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
 6068   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
 6069             Matcher::vector_element_basic_type(n) == T_LONG);
 6070   match(Set dst (MinV src1 src2));
 6071   match(Set dst (MaxV src1 src2));
  format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
 6073   ins_encode %{
 6074     assert(UseAVX > 2, "required");
 6075 
 6076     int vlen_enc = vector_length_encoding(this);
 6077     int opcode = this->ideal_Opcode();
 6078     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6079     assert(elem_bt == T_LONG, "sanity");
 6080 
 6081     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6082   %}
 6083   ins_pipe( pipe_slow );
 6084 %}
 6085 
 6086 // Float/Double vector Min/Max
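// Java's Math.min/max semantics (NaN propagation, -0.0 treated as smaller
// than +0.0) differ from raw vminps/vmaxps, which simply return the second
// operand when either input is NaN, so the rules below blend the operands
// explicitly to produce the Java result.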
 6087 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
 6088   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
 6089             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
 6090             UseAVX > 0);
 6091   match(Set dst (MinV a b));
 6092   match(Set dst (MaxV a b));
 6093   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
 6094   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
 6095   ins_encode %{
 6096     assert(UseAVX > 0, "required");
 6097 
 6098     int opcode = this->ideal_Opcode();
 6099     int vlen_enc = vector_length_encoding(this);
 6100     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6101 
 6102     __ vminmax_fp(opcode, elem_bt,
 6103                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
                  $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6105   %}
 6106   ins_pipe( pipe_slow );
 6107 %}
 6108 
 6109 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
 6110   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 6111             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
 6112   match(Set dst (MinV a b));
 6113   match(Set dst (MaxV a b));
 6114   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
  format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp, $ktmp as TEMP" %}
 6116   ins_encode %{
 6117     assert(UseAVX > 2, "required");
 6118 
 6119     int opcode = this->ideal_Opcode();
 6120     int vlen_enc = vector_length_encoding(this);
 6121     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 6122 
 6123     __ evminmax_fp(opcode, elem_bt,
 6124                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
                   $ktmp$$KRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
 6126   %}
 6127   ins_pipe( pipe_slow );
 6128 %}
 6129 
 6130 // --------------------------------- Signum/CopySign ---------------------------
 6131 
 6132 instruct signumF_reg(regF dst, regF zero, regF one, rRegP scratch, rFlagsReg cr) %{
 6133   match(Set dst (SignumF dst (Binary zero one)));
 6134   effect(TEMP scratch, KILL cr);
 6135   format %{ "signumF $dst, $dst\t! using $scratch as TEMP" %}
 6136   ins_encode %{
 6137     int opcode = this->ideal_Opcode();
 6138     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
 6139   %}
 6140   ins_pipe( pipe_slow );
 6141 %}
 6142 
 6143 instruct signumD_reg(regD dst, regD zero, regD one, rRegP scratch, rFlagsReg cr) %{
 6144   match(Set dst (SignumD dst (Binary zero one)));
 6145   effect(TEMP scratch, KILL cr);
 6146   format %{ "signumD $dst, $dst\t! using $scratch as TEMP" %}
 6147   ins_encode %{
 6148     int opcode = this->ideal_Opcode();
 6149     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
 6150   %}
 6151   ins_pipe( pipe_slow );
 6152 %}
 6153 
 6154 // ---------------------------------------
// For copySign, use 0xE4 as the truth-table immediate for vpternlog
 6156 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
 6157 // C (xmm2) is set to 0x7FFFFFFF
 6158 // Wherever xmm2 is 0, we want to pick from B (sign)
 6159 // Wherever xmm2 is 1, we want to pick from A (src)
 6160 //
 6161 // A B C Result
 6162 // 0 0 0 0
 6163 // 0 0 1 0
 6164 // 0 1 0 1
 6165 // 0 1 1 0
 6166 // 1 0 0 0
 6167 // 1 0 1 1
 6168 // 1 1 0 1
 6169 // 1 1 1 1
 6170 //
// Result going from high bit to low bit is binary 11100100 = 0xE4
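//
// Worked example (values chosen purely for illustration):
//   copySign(-1.5f, +2.0f): A = bits(-1.5f) = 0xBFC00000 (magnitude input),
//   B = bits(+2.0f) = 0x40000000 (sign input), C = 0x7FFFFFFF, so
//   (A & C) | (B & ~C) = 0x3FC00000 | 0x00000000 = 0x3FC00000 = +1.5f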
 6172 // ---------------------------------------
 6173 
 6174 #ifdef _LP64
 6175 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
 6176   match(Set dst (CopySignF dst src));
 6177   effect(TEMP tmp1, TEMP tmp2);
 6178   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6179   ins_encode %{
 6180     __ movl($tmp2$$Register, 0x7FFFFFFF);
 6181     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
 6182     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6183   %}
 6184   ins_pipe( pipe_slow );
 6185 %}
 6186 
 6187 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
 6188   match(Set dst (CopySignD dst (Binary src zero)));
 6189   ins_cost(100);
 6190   effect(TEMP tmp1, TEMP tmp2);
 6191   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
 6192   ins_encode %{
 6193     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
 6194     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
 6195     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
 6196   %}
 6197   ins_pipe( pipe_slow );
 6198 %}
 6199 #endif // _LP64
 6200 
 6201 // --------------------------------- Sqrt --------------------------------------
 6202 
 6203 instruct vsqrtF_reg(vec dst, vec src) %{
 6204   match(Set dst (SqrtVF src));
 6205   ins_cost(400);
 6206   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
 6207   ins_encode %{
 6208     assert(UseAVX > 0, "required");
 6209     int vlen_enc = vector_length_encoding(this);
 6210     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6211   %}
 6212   ins_pipe( pipe_slow );
 6213 %}
 6214 
 6215 instruct vsqrtF_mem(vec dst, memory mem) %{
 6216   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6217   match(Set dst (SqrtVF (LoadVector mem)));
 6218   ins_cost(400);
 6219   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
 6220   ins_encode %{
 6221     assert(UseAVX > 0, "required");
 6222     int vlen_enc = vector_length_encoding(this);
 6223     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6224   %}
 6225   ins_pipe( pipe_slow );
 6226 %}
 6227 
// Doubles vector sqrt
 6229 instruct vsqrtD_reg(vec dst, vec src) %{
 6230   match(Set dst (SqrtVD src));
 6231   ins_cost(400);
 6232   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
 6233   ins_encode %{
 6234     assert(UseAVX > 0, "required");
 6235     int vlen_enc = vector_length_encoding(this);
 6236     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6237   %}
 6238   ins_pipe( pipe_slow );
 6239 %}
 6240 
 6241 instruct vsqrtD_mem(vec dst, memory mem) %{
 6242   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 6243   match(Set dst (SqrtVD (LoadVector mem)));
 6244   ins_cost(400);
 6245   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
 6246   ins_encode %{
 6247     assert(UseAVX > 0, "required");
 6248     int vlen_enc = vector_length_encoding(this);
 6249     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
 6250   %}
 6251   ins_pipe( pipe_slow );
 6252 %}
 6253 
 6254 // ------------------------------ Shift ---------------------------------------
 6255 
 6256 // Left and right shift count vectors are the same on x86
 6257 // (only lowest bits of xmm reg are used for count).
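// For example, psllw/pslld/psllq take the shift count from the low 64 bits
// of the xmm operand, so one movdl of the scalar count serves both
// directions.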
 6258 instruct vshiftcnt(vec dst, rRegI cnt) %{
 6259   match(Set dst (LShiftCntV cnt));
 6260   match(Set dst (RShiftCntV cnt));
 6261   format %{ "movdl    $dst,$cnt\t! load shift count" %}
 6262   ins_encode %{
 6263     __ movdl($dst$$XMMRegister, $cnt$$Register);
 6264   %}
 6265   ins_pipe( pipe_slow );
 6266 %}
 6267 
 6268 // Byte vector shift
 6269 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
 6270   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
 6271   match(Set dst ( LShiftVB src shift));
 6272   match(Set dst ( RShiftVB src shift));
 6273   match(Set dst (URShiftVB src shift));
 6274   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
 6276   ins_encode %{
 6277     assert(UseSSE > 3, "required");
 6278     int opcode = this->ideal_Opcode();
 6279     bool sign = (opcode != Op_URShiftVB);
 6280     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
 6281     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
 6282     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
 6283     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
 6284     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
 6285   %}
 6286   ins_pipe( pipe_slow );
 6287 %}
 6288 
 6289 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
 6290   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6291             UseAVX <= 1);
 6292   match(Set dst ( LShiftVB src shift));
 6293   match(Set dst ( RShiftVB src shift));
 6294   match(Set dst (URShiftVB src shift));
 6295   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
 6297   ins_encode %{
 6298     assert(UseSSE > 3, "required");
 6299     int opcode = this->ideal_Opcode();
 6300     bool sign = (opcode != Op_URShiftVB);
 6301     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
 6302     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
 6303     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
 6304     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
 6305     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
 6306     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
 6307     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
 6308     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
 6309     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
 6310   %}
 6311   ins_pipe( pipe_slow );
 6312 %}
 6313 
 6314 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
 6315   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
 6316             UseAVX > 1);
 6317   match(Set dst ( LShiftVB src shift));
 6318   match(Set dst ( RShiftVB src shift));
 6319   match(Set dst (URShiftVB src shift));
 6320   effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
 6322   ins_encode %{
 6323     int opcode = this->ideal_Opcode();
 6324     bool sign = (opcode != Op_URShiftVB);
 6325     int vlen_enc = Assembler::AVX_256bit;
 6326     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6327     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6328     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
 6329     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
 6330     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
 6331   %}
 6332   ins_pipe( pipe_slow );
 6333 %}
 6334 
 6335 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
 6336   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
 6337   match(Set dst ( LShiftVB src shift));
 6338   match(Set dst ( RShiftVB src shift));
 6339   match(Set dst (URShiftVB src shift));
 6340   effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
 6342   ins_encode %{
 6343     assert(UseAVX > 1, "required");
 6344     int opcode = this->ideal_Opcode();
 6345     bool sign = (opcode != Op_URShiftVB);
 6346     int vlen_enc = Assembler::AVX_256bit;
 6347     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
 6348     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6349     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6350     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6351     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6352     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
 6353     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
 6354     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6355     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6356   %}
 6357   ins_pipe( pipe_slow );
 6358 %}
 6359 
 6360 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
 6361   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
 6362   match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
 6364   match(Set dst (URShiftVB src shift));
 6365   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
  format %{ "vector_byte_shift $dst,$src,$shift" %}
 6367   ins_encode %{
 6368     assert(UseAVX > 2, "required");
 6369     int opcode = this->ideal_Opcode();
 6370     bool sign = (opcode != Op_URShiftVB);
 6371     int vlen_enc = Assembler::AVX_512bit;
 6372     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
 6373     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
 6374     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6375     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6376     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6377     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
 6378     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6379     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6380     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6381     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
 6382     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
 6383     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6384   %}
 6385   ins_pipe( pipe_slow );
 6386 %}
 6387 
// A shorts vector logical right shift produces an incorrect Java result
// for negative data, because Java code converts the short value to an int
// with sign extension before shifting. Char vectors are fine, though,
// since chars are unsigned values.
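// For example, with short s = -1 and a shift of 4 (illustrative values):
//   Java scalar: (short)(((int)s) >>> 4) == (short)0x0FFFFFFF == -1
//   16-bit lane:          0xFFFF >>> 4   ==        0x0FFF     == 4095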
// Shorts/Chars vector shift
 6393 instruct vshiftS(vec dst, vec src, vec shift) %{
 6394   predicate(!n->as_ShiftV()->is_var_shift());
 6395   match(Set dst ( LShiftVS src shift));
 6396   match(Set dst ( RShiftVS src shift));
 6397   match(Set dst (URShiftVS src shift));
 6398   effect(TEMP dst, USE src, USE shift);
 6399   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
 6400   ins_encode %{
 6401     int opcode = this->ideal_Opcode();
 6402     if (UseAVX > 0) {
 6403       int vlen_enc = vector_length_encoding(this);
 6404       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6405     } else {
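      // Pre-AVX shifts are destructive, so copy src into dst first, using
      // the narrowest move that covers the vector (4, 8 or 16 bytes).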
 6406       int vlen = Matcher::vector_length(this);
 6407       if (vlen == 2) {
 6408         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 6409         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6410       } else if (vlen == 4) {
 6411         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6412         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6413       } else {
        assert(vlen == 8, "sanity");
 6415         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6416         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6417       }
 6418     }
 6419   %}
 6420   ins_pipe( pipe_slow );
 6421 %}
 6422 
// Integers vector shift
 6424 instruct vshiftI(vec dst, vec src, vec shift) %{
 6425   predicate(!n->as_ShiftV()->is_var_shift());
 6426   match(Set dst ( LShiftVI src shift));
 6427   match(Set dst ( RShiftVI src shift));
 6428   match(Set dst (URShiftVI src shift));
 6429   effect(TEMP dst, USE src, USE shift);
 6430   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
 6431   ins_encode %{
 6432     int opcode = this->ideal_Opcode();
 6433     if (UseAVX > 0) {
 6434       int vlen_enc = vector_length_encoding(this);
 6435       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6436     } else {
 6437       int vlen = Matcher::vector_length(this);
 6438       if (vlen == 2) {
 6439         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6440         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6441       } else {
 6442         assert(vlen == 4, "sanity");
 6443         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6444         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6445       }
 6446     }
 6447   %}
 6448   ins_pipe( pipe_slow );
 6449 %}
 6450 
// Integers vector constant shift
 6452 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
 6453   match(Set dst (LShiftVI src (LShiftCntV shift)));
 6454   match(Set dst (RShiftVI src (RShiftCntV shift)));
 6455   match(Set dst (URShiftVI src (RShiftCntV shift)));
 6456   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
 6457   ins_encode %{
 6458     int opcode = this->ideal_Opcode();
 6459     if (UseAVX > 0) {
      int vlen_enc = vector_length_encoding(this);
      __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vlen_enc);
 6462     } else {
 6463       int vlen = Matcher::vector_length(this);
 6464       if (vlen == 2) {
 6465         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
 6466         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6467       } else {
 6468         assert(vlen == 4, "sanity");
 6469         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6470         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6471       }
 6472     }
 6473   %}
 6474   ins_pipe( pipe_slow );
 6475 %}
 6476 
 6477 // Longs vector shift
 6478 instruct vshiftL(vec dst, vec src, vec shift) %{
 6479   predicate(!n->as_ShiftV()->is_var_shift());
 6480   match(Set dst ( LShiftVL src shift));
 6481   match(Set dst (URShiftVL src shift));
 6482   effect(TEMP dst, USE src, USE shift);
 6483   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
 6484   ins_encode %{
 6485     int opcode = this->ideal_Opcode();
 6486     if (UseAVX > 0) {
 6487       int vlen_enc = vector_length_encoding(this);
 6488       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6489     } else {
 6490       assert(Matcher::vector_length(this) == 2, "");
 6491       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6492       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
 6493     }
 6494   %}
 6495   ins_pipe( pipe_slow );
 6496 %}
 6497 
 6498 // Longs vector constant shift
 6499 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
 6500   match(Set dst (LShiftVL src (LShiftCntV shift)));
 6501   match(Set dst (URShiftVL src (RShiftCntV shift)));
 6502   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
 6503   ins_encode %{
 6504     int opcode = this->ideal_Opcode();
 6505     if (UseAVX > 0) {
      int vlen_enc = vector_length_encoding(this);
      __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vlen_enc);
 6508     } else {
      assert(Matcher::vector_length(this) == 2, "sanity");
 6510       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6511       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
 6512     }
 6513   %}
 6514   ins_pipe( pipe_slow );
 6515 %}
 6516 
 6517 // -------------------ArithmeticRightShift -----------------------------------
 6518 // Long vector arithmetic right shift
 6519 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
 6520   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
 6521   match(Set dst (RShiftVL src shift));
 6522   effect(TEMP dst, TEMP tmp, TEMP scratch);
 6523   format %{ "vshiftq $dst,$src,$shift" %}
 6524   ins_encode %{
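    // Neither SSE nor AVX2 has an arithmetic right shift for 64-bit lanes.
    // Emulate it: with t = (sign_mask >>> shift), the identity
    //   x >> shift == ((x >>> shift) ^ t) - t
    // re-extends the sign bit after a logical shift.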
 6525     uint vlen = Matcher::vector_length(this);
 6526     if (vlen == 2) {
 6527       assert(UseSSE >= 2, "required");
 6528       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
 6529       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
 6530       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
 6531       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
 6532       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
 6533       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
 6534     } else {
 6535       assert(vlen == 4, "sanity");
 6536       assert(UseAVX > 1, "required");
 6537       int vlen_enc = Assembler::AVX_256bit;
 6538       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6539       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
 6540       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6541       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6542       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
 6543     }
 6544   %}
 6545   ins_pipe( pipe_slow );
 6546 %}
 6547 
 6548 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
 6549   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
 6550   match(Set dst (RShiftVL src shift));
 6551   format %{ "vshiftq $dst,$src,$shift" %}
 6552   ins_encode %{
 6553     int vlen_enc = vector_length_encoding(this);
 6554     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6555   %}
 6556   ins_pipe( pipe_slow );
 6557 %}
 6558 
 6559 // ------------------- Variable Shift -----------------------------
 6560 // Byte variable shift
 6561 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
 6562   predicate(Matcher::vector_length(n) <= 8 &&
 6563             n->as_ShiftV()->is_var_shift() &&
 6564             !VM_Version::supports_avx512bw());
 6565   match(Set dst ( LShiftVB src shift));
 6566   match(Set dst ( RShiftVB src shift));
 6567   match(Set dst (URShiftVB src shift));
 6568   effect(TEMP dst, TEMP vtmp, TEMP scratch);
 6569   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
 6570   ins_encode %{
 6571     assert(UseAVX >= 2, "required");
 6572 
 6573     int opcode = this->ideal_Opcode();
 6574     int vlen_enc = Assembler::AVX_128bit;
 6575     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
 6576     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 6577   %}
 6578   ins_pipe( pipe_slow );
 6579 %}
 6580 
 6581 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
 6582   predicate(Matcher::vector_length(n) == 16 &&
 6583             n->as_ShiftV()->is_var_shift() &&
 6584             !VM_Version::supports_avx512bw());
 6585   match(Set dst ( LShiftVB src shift));
 6586   match(Set dst ( RShiftVB src shift));
 6587   match(Set dst (URShiftVB src shift));
 6588   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
 6589   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
 6590   ins_encode %{
 6591     assert(UseAVX >= 2, "required");
 6592 
 6593     int opcode = this->ideal_Opcode();
 6594     int vlen_enc = Assembler::AVX_128bit;
 6595     // Shift lower half and get word result in dst
 6596     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
 6597 
 6598     // Shift upper half and get word result in vtmp1
 6599     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6600     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6601     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
 6602 
 6603     // Merge and down convert the two word results to byte in dst
 6604     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6605   %}
 6606   ins_pipe( pipe_slow );
 6607 %}
 6608 
 6609 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
 6610   predicate(Matcher::vector_length(n) == 32 &&
 6611             n->as_ShiftV()->is_var_shift() &&
 6612             !VM_Version::supports_avx512bw());
 6613   match(Set dst ( LShiftVB src shift));
 6614   match(Set dst ( RShiftVB src shift));
 6615   match(Set dst (URShiftVB src shift));
 6616   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
 6617   format %{ "vector_varshift_byte $dst, $src, $shift\n\t using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
 6618   ins_encode %{
 6619     assert(UseAVX >= 2, "required");
 6620 
 6621     int opcode = this->ideal_Opcode();
 6622     int vlen_enc = Assembler::AVX_128bit;
 6623     // Process lower 128 bits and get result in dst
 6624     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
 6625     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
 6626     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
 6627     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
 6628     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6629 
    // Process upper 128 bits and get result in vtmp3
 6631     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6632     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6633     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
 6634     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
 6635     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
 6636     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
 6637     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
 6638 
 6639     // Merge the two results in dst
 6640     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6641   %}
 6642   ins_pipe( pipe_slow );
 6643 %}
 6644 
 6645 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
 6646   predicate(Matcher::vector_length(n) <= 32 &&
 6647             n->as_ShiftV()->is_var_shift() &&
 6648             VM_Version::supports_avx512bw());
 6649   match(Set dst ( LShiftVB src shift));
 6650   match(Set dst ( RShiftVB src shift));
 6651   match(Set dst (URShiftVB src shift));
 6652   effect(TEMP dst, TEMP vtmp, TEMP scratch);
 6653   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
 6654   ins_encode %{
 6655     assert(UseAVX > 2, "required");
 6656 
 6657     int opcode = this->ideal_Opcode();
 6658     int vlen_enc = vector_length_encoding(this);
 6659     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
 6660   %}
 6661   ins_pipe( pipe_slow );
 6662 %}
 6663 
 6664 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
 6665   predicate(Matcher::vector_length(n) == 64 &&
 6666             n->as_ShiftV()->is_var_shift() &&
 6667             VM_Version::supports_avx512bw());
 6668   match(Set dst ( LShiftVB src shift));
 6669   match(Set dst ( RShiftVB src shift));
 6670   match(Set dst (URShiftVB src shift));
 6671   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
 6672   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
 6673   ins_encode %{
 6674     assert(UseAVX > 2, "required");
 6675 
 6676     int opcode = this->ideal_Opcode();
 6677     int vlen_enc = Assembler::AVX_256bit;
 6678     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
 6679     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
 6680     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
 6681     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
 6682     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
 6683   %}
 6684   ins_pipe( pipe_slow );
 6685 %}
 6686 
 6687 // Short variable shift
 6688 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
 6689   predicate(Matcher::vector_length(n) <= 8 &&
 6690             n->as_ShiftV()->is_var_shift() &&
 6691             !VM_Version::supports_avx512bw());
 6692   match(Set dst ( LShiftVS src shift));
 6693   match(Set dst ( RShiftVS src shift));
 6694   match(Set dst (URShiftVS src shift));
 6695   effect(TEMP dst, TEMP vtmp, TEMP scratch);
 6696   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
 6697   ins_encode %{
 6698     assert(UseAVX >= 2, "required");
 6699 
 6700     int opcode = this->ideal_Opcode();
 6701     bool sign = (opcode != Op_URShiftVS);
 6702     int vlen_enc = Assembler::AVX_256bit;
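    // AVX2 has per-lane variable shifts only for 32/64-bit elements
    // (vpsllvd and friends; the 16-bit variants need AVX512BW), so widen
    // the shorts and the shift counts to ints, do a variable dword shift,
    // then mask and pack back down to shorts.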
    __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6705     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 6706     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
 6707     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
 6708     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 6709   %}
 6710   ins_pipe( pipe_slow );
 6711 %}
 6712 
 6713 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
 6714   predicate(Matcher::vector_length(n) == 16 &&
 6715             n->as_ShiftV()->is_var_shift() &&
 6716             !VM_Version::supports_avx512bw());
 6717   match(Set dst ( LShiftVS src shift));
 6718   match(Set dst ( RShiftVS src shift));
 6719   match(Set dst (URShiftVS src shift));
 6720   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
 6721   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
 6722   ins_encode %{
 6723     assert(UseAVX >= 2, "required");
 6724 
 6725     int opcode = this->ideal_Opcode();
 6726     bool sign = (opcode != Op_URShiftVS);
 6727     int vlen_enc = Assembler::AVX_256bit;
 6728     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
 6729     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6730     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6731     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6732     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
 6733 
 6734     // Shift upper half, with result in dst using vtmp1 as TEMP
 6735     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
 6736     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
 6737     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6738     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6739     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
 6740     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
 6741 
 6742     // Merge lower and upper half result into dst
 6743     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6744     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
 6745   %}
 6746   ins_pipe( pipe_slow );
 6747 %}
 6748 
 6749 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
 6750   predicate(n->as_ShiftV()->is_var_shift() &&
 6751             VM_Version::supports_avx512bw());
 6752   match(Set dst ( LShiftVS src shift));
 6753   match(Set dst ( RShiftVS src shift));
 6754   match(Set dst (URShiftVS src shift));
 6755   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
 6756   ins_encode %{
 6757     assert(UseAVX > 2, "required");
 6758 
 6759     int opcode = this->ideal_Opcode();
 6760     int vlen_enc = vector_length_encoding(this);
 6761     if (!VM_Version::supports_avx512vl()) {
 6762       vlen_enc = Assembler::AVX_512bit;
 6763     }
 6764     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6765   %}
 6766   ins_pipe( pipe_slow );
 6767 %}
 6768 
// Integer variable shift
 6770 instruct vshiftI_var(vec dst, vec src, vec shift) %{
 6771   predicate(n->as_ShiftV()->is_var_shift());
 6772   match(Set dst ( LShiftVI src shift));
 6773   match(Set dst ( RShiftVI src shift));
 6774   match(Set dst (URShiftVI src shift));
 6775   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
 6776   ins_encode %{
 6777     assert(UseAVX >= 2, "required");
 6778 
 6779     int opcode = this->ideal_Opcode();
 6780     int vlen_enc = vector_length_encoding(this);
 6781     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6782   %}
 6783   ins_pipe( pipe_slow );
 6784 %}
 6785 
// Long variable shift
 6787 instruct vshiftL_var(vec dst, vec src, vec shift) %{
 6788   predicate(n->as_ShiftV()->is_var_shift());
 6789   match(Set dst ( LShiftVL src shift));
 6790   match(Set dst (URShiftVL src shift));
 6791   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
 6792   ins_encode %{
 6793     assert(UseAVX >= 2, "required");
 6794 
 6795     int opcode = this->ideal_Opcode();
 6796     int vlen_enc = vector_length_encoding(this);
 6797     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6798   %}
 6799   ins_pipe( pipe_slow );
 6800 %}
 6801 
// Long variable arithmetic right shift
 6803 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
 6804   predicate(Matcher::vector_length(n) <= 4 &&
 6805             n->as_ShiftV()->is_var_shift() &&
 6806             UseAVX == 2);
 6807   match(Set dst (RShiftVL src shift));
 6808   effect(TEMP dst, TEMP vtmp);
 6809   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
 6810   ins_encode %{
 6811     int opcode = this->ideal_Opcode();
 6812     int vlen_enc = vector_length_encoding(this);
 6813     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
 6814                  $vtmp$$XMMRegister);
 6815   %}
 6816   ins_pipe( pipe_slow );
 6817 %}
 6818 
 6819 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
 6820   predicate(n->as_ShiftV()->is_var_shift() &&
 6821             UseAVX > 2);
 6822   match(Set dst (RShiftVL src shift));
 6823   format %{ "vector_varfshift_long $dst,$src,$shift\t!" %}
 6824   ins_encode %{
 6825     int opcode = this->ideal_Opcode();
 6826     int vlen_enc = vector_length_encoding(this);
 6827     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
 6828   %}
 6829   ins_pipe( pipe_slow );
 6830 %}
 6831 
 6832 // --------------------------------- AND --------------------------------------
 6833 
 6834 instruct vand(vec dst, vec src) %{
 6835   predicate(UseAVX == 0);
 6836   match(Set dst (AndV dst src));
 6837   format %{ "pand    $dst,$src\t! and vectors" %}
 6838   ins_encode %{
 6839     __ pand($dst$$XMMRegister, $src$$XMMRegister);
 6840   %}
 6841   ins_pipe( pipe_slow );
 6842 %}
 6843 
 6844 instruct vand_reg(vec dst, vec src1, vec src2) %{
 6845   predicate(UseAVX > 0);
 6846   match(Set dst (AndV src1 src2));
 6847   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
 6848   ins_encode %{
 6849     int vlen_enc = vector_length_encoding(this);
 6850     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6851   %}
 6852   ins_pipe( pipe_slow );
 6853 %}
 6854 
 6855 instruct vand_mem(vec dst, vec src, memory mem) %{
 6856   predicate((UseAVX > 0) &&
 6857             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6858   match(Set dst (AndV src (LoadVector mem)));
 6859   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
 6860   ins_encode %{
 6861     int vlen_enc = vector_length_encoding(this);
 6862     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6863   %}
 6864   ins_pipe( pipe_slow );
 6865 %}
 6866 
 6867 // --------------------------------- OR ---------------------------------------
 6868 
 6869 instruct vor(vec dst, vec src) %{
 6870   predicate(UseAVX == 0);
 6871   match(Set dst (OrV dst src));
 6872   format %{ "por     $dst,$src\t! or vectors" %}
 6873   ins_encode %{
 6874     __ por($dst$$XMMRegister, $src$$XMMRegister);
 6875   %}
 6876   ins_pipe( pipe_slow );
 6877 %}
 6878 
 6879 instruct vor_reg(vec dst, vec src1, vec src2) %{
 6880   predicate(UseAVX > 0);
 6881   match(Set dst (OrV src1 src2));
 6882   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
 6883   ins_encode %{
 6884     int vlen_enc = vector_length_encoding(this);
 6885     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6886   %}
 6887   ins_pipe( pipe_slow );
 6888 %}
 6889 
 6890 instruct vor_mem(vec dst, vec src, memory mem) %{
 6891   predicate((UseAVX > 0) &&
 6892             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6893   match(Set dst (OrV src (LoadVector mem)));
 6894   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
 6895   ins_encode %{
 6896     int vlen_enc = vector_length_encoding(this);
 6897     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6898   %}
 6899   ins_pipe( pipe_slow );
 6900 %}
 6901 
 6902 // --------------------------------- XOR --------------------------------------
 6903 
 6904 instruct vxor(vec dst, vec src) %{
 6905   predicate(UseAVX == 0);
 6906   match(Set dst (XorV dst src));
 6907   format %{ "pxor    $dst,$src\t! xor vectors" %}
 6908   ins_encode %{
 6909     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
 6910   %}
 6911   ins_pipe( pipe_slow );
 6912 %}
 6913 
 6914 instruct vxor_reg(vec dst, vec src1, vec src2) %{
 6915   predicate(UseAVX > 0);
 6916   match(Set dst (XorV src1 src2));
 6917   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
 6918   ins_encode %{
 6919     int vlen_enc = vector_length_encoding(this);
 6920     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 6921   %}
 6922   ins_pipe( pipe_slow );
 6923 %}
 6924 
 6925 instruct vxor_mem(vec dst, vec src, memory mem) %{
 6926   predicate((UseAVX > 0) &&
 6927             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
 6928   match(Set dst (XorV src (LoadVector mem)));
 6929   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
 6930   ins_encode %{
 6931     int vlen_enc = vector_length_encoding(this);
 6932     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
 6933   %}
 6934   ins_pipe( pipe_slow );
 6935 %}
 6936 
 6937 // --------------------------------- VectorCast --------------------------------------
 6938 
 6939 instruct vcastBtoX(vec dst, vec src) %{
 6940   match(Set dst (VectorCastB2X src));
 6941   format %{ "vector_cast_b2x $dst,$src\t!" %}
 6942   ins_encode %{
 6943     assert(UseAVX > 0, "required");
 6944 
 6945     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 6946     int vlen_enc = vector_length_encoding(this);
 6947     switch (to_elem_bt) {
 6948       case T_SHORT:
 6949         __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6950         break;
 6951       case T_INT:
 6952         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6953         break;
 6954       case T_FLOAT:
 6955         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6956         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6957         break;
 6958       case T_LONG:
 6959         __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 6960         break;
 6961       case T_DOUBLE: {
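        // A double is 8x wider than a byte, so the intermediate int vector
        // needs only half the width of the final double vector.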
 6962         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 6963         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 6964         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 6965         break;
 6966       }
 6967       default: assert(false, "%s", type2name(to_elem_bt));
 6968     }
 6969   %}
 6970   ins_pipe( pipe_slow );
 6971 %}
 6972 
 6973 instruct castStoX(vec dst, vec src, rRegP scratch) %{
 6974   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 6975             Matcher::vector_length(n->in(1)) <= 8 && // src
 6976             Matcher::vector_element_basic_type(n) == T_BYTE);
 6977   effect(TEMP scratch);
 6978   match(Set dst (VectorCastS2X src));
 6979   format %{ "vector_cast_s2x $dst,$src\t! using $scratch as TEMP" %}
 6980   ins_encode %{
 6981     assert(UseAVX > 0, "required");
 6982 
 6983     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, $scratch$$Register);
 6984     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
 6985   %}
 6986   ins_pipe( pipe_slow );
 6987 %}
 6988 
 6989 instruct vcastStoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
 6990   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
 6991             Matcher::vector_length(n->in(1)) == 16 && // src
 6992             Matcher::vector_element_basic_type(n) == T_BYTE);
 6993   effect(TEMP dst, TEMP vtmp, TEMP scratch);
 6994   match(Set dst (VectorCastS2X src));
 6995   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp, $scratch as TEMP" %}
 6996   ins_encode %{
 6997     assert(UseAVX > 0, "required");
 6998 
 6999     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
 7000     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
 7001     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 7002     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
 7003   %}
 7004   ins_pipe( pipe_slow );
 7005 %}
 7006 
 7007 instruct vcastStoX_evex(vec dst, vec src) %{
 7008   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
 7009             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7010   match(Set dst (VectorCastS2X src));
 7011   format %{ "vector_cast_s2x $dst,$src\t!" %}
 7012   ins_encode %{
 7013     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7014     int src_vlen_enc = vector_length_encoding(this, $src);
 7015     int vlen_enc = vector_length_encoding(this);
 7016     switch (to_elem_bt) {
 7017       case T_BYTE:
        if (!VM_Version::supports_avx512vl()) {
          src_vlen_enc = Assembler::AVX_512bit;
        }
 7021         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7022         break;
 7023       case T_INT:
 7024         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7025         break;
 7026       case T_FLOAT:
 7027         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7028         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7029         break;
 7030       case T_LONG:
 7031         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7032         break;
 7033       case T_DOUBLE: {
 7034         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
 7035         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
 7036         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7037         break;
 7038       }
 7039       default:
 7040         ShouldNotReachHere();
 7041     }
 7042   %}
 7043   ins_pipe( pipe_slow );
 7044 %}
 7045 
 7046 instruct castItoX(vec dst, vec src, rRegP scratch) %{
 7047   predicate(UseAVX <= 2 &&
 7048             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
 7049             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7050   match(Set dst (VectorCastI2X src));
 7051   format %{ "vector_cast_i2x $dst,$src\t! using $scratch as TEMP" %}
 7052   effect(TEMP scratch);
 7053   ins_encode %{
 7054     assert(UseAVX > 0, "required");
 7055 
 7056     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7057     int vlen_enc = vector_length_encoding(this, $src);
 7058 
 7059     if (to_elem_bt == T_BYTE) {
 7060       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
 7061       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7062       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7063     } else {
 7064       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7065       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
 7066       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7067     }
 7068   %}
 7069   ins_pipe( pipe_slow );
 7070 %}
 7071 
 7072 instruct vcastItoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
 7073   predicate(UseAVX <= 2 &&
 7074             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
 7075             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
 7076   match(Set dst (VectorCastI2X src));
 7077   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp and $scratch as TEMP" %}
 7078   effect(TEMP dst, TEMP vtmp, TEMP scratch);
 7079   ins_encode %{
 7080     assert(UseAVX > 0, "required");
 7081 
 7082     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7083     int vlen_enc = vector_length_encoding(this, $src);
 7084 
 7085     if (to_elem_bt == T_BYTE) {
 7086       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
 7087       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7088       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7089       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7090     } else {
 7091       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
 7092       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
 7093       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
 7094       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 7095     }
 7096   %}
 7097   ins_pipe( pipe_slow );
 7098 %}
 7099 
 7100 instruct vcastItoX_evex(vec dst, vec src) %{
 7101   predicate(UseAVX > 2 ||
 7102             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
 7103   match(Set dst (VectorCastI2X src));
 7104   format %{ "vector_cast_i2x $dst,$src\t!" %}
 7105   ins_encode %{
 7106     assert(UseAVX > 0, "required");
 7107 
 7108     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
 7109     int src_vlen_enc = vector_length_encoding(this, $src);
 7110     int dst_vlen_enc = vector_length_encoding(this);
 7111     switch (dst_elem_bt) {
 7112       case T_BYTE:
 7113         if (!VM_Version::supports_avx512vl()) {
 7114           src_vlen_enc = Assembler::AVX_512bit;
 7115         }
 7116         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7117         break;
 7118       case T_SHORT:
 7119         if (!VM_Version::supports_avx512vl()) {
 7120           src_vlen_enc = Assembler::AVX_512bit;
 7121         }
 7122         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 7123         break;
 7124       case T_FLOAT:
 7125         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7126         break;
 7127       case T_LONG:
 7128         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7129         break;
 7130       case T_DOUBLE:
 7131         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
 7132         break;
 7133       default:
 7134         ShouldNotReachHere();
 7135     }
 7136   %}
 7137   ins_pipe( pipe_slow );
 7138 %}
 7139 
 7140 instruct vcastLtoBS(vec dst, vec src, rRegP scratch) %{
 7141   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
 7142             UseAVX <= 2);
 7143   match(Set dst (VectorCastL2X src));
 7144   effect(TEMP scratch);
 7145   format %{ "vector_cast_l2x  $dst,$src\t! using $scratch as TEMP" %}
 7146   ins_encode %{
 7147     assert(UseAVX > 0, "required");
 7148 
 7149     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7150     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
 7151     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
 7152                                                       : ExternalAddress(vector_int_to_short_mask());
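    // Gather the low 32 bits of each long into the bottom of dst, mask to
    // the target element width, and pack down to shorts (and then bytes).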
 7153     if (vlen <= 16) {
 7154       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
 7155       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
 7156       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7157     } else {
 7158       assert(vlen <= 32, "required");
 7159       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
 7160       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
 7161       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
 7162       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7163     }
 7164     if (to_elem_bt == T_BYTE) {
 7165       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
 7166     }
 7167   %}
 7168   ins_pipe( pipe_slow );
 7169 %}
 7170 
 7171 instruct vcastLtoX_evex(vec dst, vec src) %{
 7172   predicate(UseAVX > 2 ||
 7173             (Matcher::vector_element_basic_type(n) == T_INT ||
 7174              Matcher::vector_element_basic_type(n) == T_FLOAT ||
 7175              Matcher::vector_element_basic_type(n) == T_DOUBLE));
 7176   match(Set dst (VectorCastL2X src));
 7177   format %{ "vector_cast_l2x  $dst,$src\t!" %}
 7178   ins_encode %{
 7179     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7180     int vlen = Matcher::vector_length_in_bytes(this, $src);
 7181     int vlen_enc = vector_length_encoding(this, $src);
 7182     switch (to_elem_bt) {
 7183       case T_BYTE:
 7184         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7185           vlen_enc = Assembler::AVX_512bit;
 7186         }
 7187         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7188         break;
 7189       case T_SHORT:
 7190         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
 7191           vlen_enc = Assembler::AVX_512bit;
 7192         }
 7193         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7194         break;
 7195       case T_INT:
 7196         if (vlen == 8) {
 7197           if ($dst$$XMMRegister != $src$$XMMRegister) {
 7198             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
 7199           }
 7200         } else if (vlen == 16) {
 7201           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
 7202         } else if (vlen == 32) {
 7203           if (UseAVX > 2) {
 7204             if (!VM_Version::supports_avx512vl()) {
 7205               vlen_enc = Assembler::AVX_512bit;
 7206             }
 7207             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7208           } else {
 7209             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
 7210             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 7211           }
 7212         } else { // vlen == 64
 7213           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7214         }
 7215         break;
 7216       case T_FLOAT:
 7217         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7218         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7219         break;
 7220       case T_DOUBLE:
 7221         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
 7222         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7223         break;
 7224 
 7225       default: assert(false, "%s", type2name(to_elem_bt));
 7226     }
 7227   %}
 7228   ins_pipe( pipe_slow );
 7229 %}
 7230 
 7231 instruct vcastFtoD_reg(vec dst, vec src) %{
 7232   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
 7233   match(Set dst (VectorCastF2X src));
 7234   format %{ "vector_cast_f2d  $dst,$src\t!" %}
 7235   ins_encode %{
 7236     int vlen_enc = vector_length_encoding(this);
 7237     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7238   %}
 7239   ins_pipe( pipe_slow );
 7240 %}
 7241 
 7243 instruct castFtoI_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rRegP scratch, rFlagsReg cr) %{
 7244   predicate(!VM_Version::supports_avx512vl() &&
 7245             Matcher::vector_length_in_bytes(n) < 64 &&
 7246             Matcher::vector_element_basic_type(n) == T_INT);
 7247   match(Set dst (VectorCastF2X src));
 7248   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP scratch, KILL cr);
 7249   format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $scratch as TEMP" %}
 7250   ins_encode %{
 7251     int vlen_enc = vector_length_encoding(this);
 7252     __ vector_castF2I_avx($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7253                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7254                           ExternalAddress(vector_float_signflip()), $scratch$$Register, vlen_enc);
 7255   %}
 7256   ins_pipe( pipe_slow );
 7257 %}
 7258 
 7259 instruct castFtoI_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
 7260   predicate((VM_Version::supports_avx512vl() ||
 7261              Matcher::vector_length_in_bytes(n) == 64) &&
 7262              Matcher::vector_element_basic_type(n) == T_INT);
 7263   match(Set dst (VectorCastF2X src));
 7264   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
 7265   format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %}
 7266   ins_encode %{
 7267     int vlen_enc = vector_length_encoding(this);
 7268     __ vector_castF2I_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7269                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7270                            ExternalAddress(vector_float_signflip()), $scratch$$Register, vlen_enc);
 7271   %}
 7272   ins_pipe( pipe_slow );
 7273 %}
 7274 
 7275 instruct vcastDtoF_reg(vec dst, vec src) %{
 7276   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
 7277   match(Set dst (VectorCastD2X src));
 7278   format %{ "vector_cast_d2x  $dst,$src\t!" %}
 7279   ins_encode %{
 7280     int vlen_enc = vector_length_encoding(this, $src);
 7281     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7282   %}
 7283   ins_pipe( pipe_slow );
 7284 %}
 7285 
 7286 instruct castDtoL_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
 7287   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7288   match(Set dst (VectorCastD2X src));
 7289   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
 7290   format %{ "vector_cast_d2l $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %}
 7291   ins_encode %{
 7292     int vlen_enc = vector_length_encoding(this);
 7293     __ vector_castD2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7294                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7295                            ExternalAddress(vector_double_signflip()), $scratch$$Register, vlen_enc);
 7296   %}
 7297   ins_pipe( pipe_slow );
 7298 %}
 7299 
 7300 instruct vucast(vec dst, vec src) %{
 7301   match(Set dst (VectorUCastB2X src));
 7302   match(Set dst (VectorUCastS2X src));
 7303   match(Set dst (VectorUCastI2X src));
 7304   format %{ "vector_ucast $dst,$src\t!" %}
 7305   ins_encode %{
 7306     assert(UseAVX > 0, "required");
 7307 
 7308     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
 7309     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
 7310     int vlen_enc = vector_length_encoding(this);
 7311     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
 7312   %}
 7313   ins_pipe( pipe_slow );
 7314 %}
 7315 
 7316 #ifdef _LP64
 7317 instruct vround_float_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rRegP scratch, rFlagsReg cr) %{
 7318   predicate(!VM_Version::supports_avx512vl() &&
 7319             Matcher::vector_length_in_bytes(n) < 64 &&
 7320             Matcher::vector_element_basic_type(n) == T_INT);
 7321   match(Set dst (RoundVF src));
 7322   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP scratch, KILL cr);
 7323   format %{ "vector_round_float $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $scratch as TEMP" %}
 7324   ins_encode %{
 7325     int vlen_enc = vector_length_encoding(this);
 7326     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7327     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7328                               $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 7329                               ExternalAddress(vector_float_signflip()), new_mxcsr, $scratch$$Register, vlen_enc);
 7330   %}
 7331   ins_pipe( pipe_slow );
 7332 %}
 7333 
 7334 instruct vround_float_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
 7335   predicate((VM_Version::supports_avx512vl() ||
 7336              Matcher::vector_length_in_bytes(n) == 64) &&
 7337              Matcher::vector_element_basic_type(n) == T_INT);
 7338   match(Set dst (RoundVF src));
 7339   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
 7340   format %{ "vector_round_float $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %}
 7341   ins_encode %{
 7342     int vlen_enc = vector_length_encoding(this);
 7343     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7344     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7345                                $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7346                                ExternalAddress(vector_float_signflip()), new_mxcsr, $scratch$$Register, vlen_enc);
 7347   %}
 7348   ins_pipe( pipe_slow );
 7349 %}
 7350 
 7351 instruct vround_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
 7352   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
 7353   match(Set dst (RoundVD src));
 7354   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
 7355   format %{ "vector_round_long $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %}
 7356   ins_encode %{
 7357     int vlen_enc = vector_length_encoding(this);
 7358     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
 7359     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 7360                                 $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
 7361                                 ExternalAddress(vector_double_signflip()), new_mxcsr, $scratch$$Register, vlen_enc);
 7362   %}
 7363   ins_pipe( pipe_slow );
 7364 %}
#endif

 7366 // --------------------------------- VectorMaskCmp --------------------------------------
 7367 
 7368 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7369   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7370             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
 7371             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7372             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7373   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7374   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7375   ins_encode %{
 7376     int vlen_enc = vector_length_encoding(this, $src1);
 7377     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7378     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7379       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7380     } else {
 7381       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7382     }
 7383   %}
 7384   ins_pipe( pipe_slow );
 7385 %}
 7386 
 7387 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
 7388   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
 7389             n->bottom_type()->isa_vectmask() == NULL &&
 7390             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7391   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7392   effect(TEMP scratch, TEMP ktmp);
 7393   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
 7394   ins_encode %{
 7395     int vlen_enc = Assembler::AVX_512bit;
 7396     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7397     KRegister mask = k0; // The comparison itself is not being masked.
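    // The AVX-512 compare writes a kmask; expand it into a lane-wise
    // all-ones/all-zeroes vector with a zero-masked load of all_bits_set.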
 7398     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7399       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7400       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
 7401     } else {
 7402       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7403       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
 7404     }
 7405   %}
 7406   ins_pipe( pipe_slow );
 7407 %}
 7408 
 7409 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
 7410   predicate(n->bottom_type()->isa_vectmask() &&
 7411             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
 7412   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7413   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
 7414   ins_encode %{
 7415     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7416     int vlen_enc = vector_length_encoding(this, $src1);
 7417     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
 7418     KRegister mask = k0; // The comparison itself is not being masked.
 7419     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
 7420       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7421     } else {
 7422       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
 7423     }
 7424   %}
 7425   ins_pipe( pipe_slow );
 7426 %}
 7427 
 7428 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
 7429   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7430             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7431             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7432             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7433             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7434             (n->in(2)->get_int() == BoolTest::eq ||
 7435              n->in(2)->get_int() == BoolTest::lt ||
 7436              n->in(2)->get_int() == BoolTest::gt)); // cond
 7437   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7438   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
 7439   ins_encode %{
 7440     int vlen_enc = vector_length_encoding(this, $src1);
 7441     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7442     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7443     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
 7444   %}
 7445   ins_pipe( pipe_slow );
 7446 %}
 7447 
 7448 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7449   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7450             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7451             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7452             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7453             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
 7454             (n->in(2)->get_int() == BoolTest::ne ||
 7455              n->in(2)->get_int() == BoolTest::le ||
 7456              n->in(2)->get_int() == BoolTest::ge)); // cond
 7457   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7458   effect(TEMP dst, TEMP xtmp);
 7459   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7460   ins_encode %{
 7461     int vlen_enc = vector_length_encoding(this, $src1);
 7462     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7463     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
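    // ne/le/ge have no direct SSE/AVX encoding; vpcmpCCW computes the
    // complementary predicate and negates the result, using $xtmp as
    // scratch for the all-ones vector.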
 7464     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7465   %}
 7466   ins_pipe( pipe_slow );
 7467 %}
 7468 
 7469 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
 7470   predicate(n->bottom_type()->isa_vectmask() == NULL &&
 7471             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
 7472             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
 7473             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
 7474             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7475   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7476   effect(TEMP dst, TEMP xtmp);
 7477   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
 7478   ins_encode %{
 7479     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
 7480     int vlen_enc = vector_length_encoding(this, $src1);
 7481     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7482     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
 7483 
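    // There is no unsigned integer vector compare before AVX-512: flip the
    // sign bit of both operands (broadcast flip_bit, then XOR) and use the
    // signed compare, since a <u b iff (a ^ sign_bit) <s (b ^ sign_bit).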
 7484     if (vlen_enc == Assembler::AVX_128bit) {
 7485       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7486     } else {
 7487       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
 7488     }
 7489     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
 7490     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 7491     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
 7492   %}
 7493   ins_pipe( pipe_slow );
 7494 %}
 7495 
 7496 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
 7497   predicate((n->bottom_type()->isa_vectmask() == NULL &&
 7498              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
 7499              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7500   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7501   effect(TEMP scratch, TEMP ktmp);
 7502   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
 7503   ins_encode %{
 7504     assert(UseAVX > 2, "required");
 7505 
 7506     int vlen_enc = vector_length_encoding(this, $src1);
 7507     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7508     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
 7509     KRegister mask = k0; // The comparison itself is not being masked.
 7510     bool merge = false;
 7511     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7512 
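    // AVX512 integer compares produce a k-register mask; the masked
    // (zeroing) load of the all-ones constant below expands that mask
    // back into a -1/0 vector in $dst.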
 7513     switch (src1_elem_bt) {
 7514       case T_INT: {
 7515         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7516         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
 7517         break;
 7518       }
 7519       case T_LONG: {
 7520         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7521         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
 7522         break;
 7523       }
 7524       default: assert(false, "%s", type2name(src1_elem_bt));
 7525     }
 7526   %}
 7527   ins_pipe( pipe_slow );
 7528 %}
 7529 
 7530 
 7531 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
 7532   predicate(n->bottom_type()->isa_vectmask() &&
 7533             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
 7534   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
 7535   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
 7536   ins_encode %{
 7537     assert(UseAVX > 2, "required");
 7538     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
 7539 
 7540     int vlen_enc = vector_length_encoding(this, $src1);
 7541     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
 7542     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
 7543     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
 7544 
    // Perform the comparison; the result is a k-register mask in $dst.
 7546     switch (src1_elem_bt) {
 7547       case T_BYTE: {
 7548         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7549         break;
 7550       }
 7551       case T_SHORT: {
 7552         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7553         break;
 7554       }
 7555       case T_INT: {
 7556         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7557         break;
 7558       }
 7559       case T_LONG: {
 7560         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
 7561         break;
 7562       }
 7563       default: assert(false, "%s", type2name(src1_elem_bt));
 7564     }
 7565   %}
 7566   ins_pipe( pipe_slow );
 7567 %}
 7568 
 7569 // Extract
 7570 
 7571 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
 7572   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
 7573   match(Set dst (ExtractI src idx));
 7574   match(Set dst (ExtractS src idx));
 7575 #ifdef _LP64
 7576   match(Set dst (ExtractB src idx));
 7577 #endif
 7578   format %{ "extractI $dst,$src,$idx\t!" %}
 7579   ins_encode %{
 7580     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7581 
 7582     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7583     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7584   %}
 7585   ins_pipe( pipe_slow );
 7586 %}
 7587 
 7588 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
 7589   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
 7590             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
 7591   match(Set dst (ExtractI src idx));
 7592   match(Set dst (ExtractS src idx));
 7593 #ifdef _LP64
 7594   match(Set dst (ExtractB src idx));
 7595 #endif
 7596   effect(TEMP vtmp);
 7597   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7598   ins_encode %{
 7599     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7600 
 7601     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
 7602     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7603     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
 7604   %}
 7605   ins_pipe( pipe_slow );
 7606 %}
 7607 
 7608 #ifdef _LP64
 7609 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
 7610   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
 7611   match(Set dst (ExtractL src idx));
 7612   format %{ "extractL $dst,$src,$idx\t!" %}
 7613   ins_encode %{
 7614     assert(UseSSE >= 4, "required");
 7615     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7616 
 7617     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
 7618   %}
 7619   ins_pipe( pipe_slow );
 7620 %}
 7621 
 7622 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
 7623   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7624             Matcher::vector_length(n->in(1)) == 8);  // src
 7625   match(Set dst (ExtractL src idx));
 7626   effect(TEMP vtmp);
 7627   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7628   ins_encode %{
 7629     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7630 
 7631     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7632     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
 7633   %}
 7634   ins_pipe( pipe_slow );
 7635 %}
 7636 #endif
 7637 
 7638 instruct extractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
 7639   predicate(Matcher::vector_length(n->in(1)) <= 4);
 7640   match(Set dst (ExtractF src idx));
 7641   effect(TEMP dst, TEMP tmp, TEMP vtmp);
 7642   format %{ "extractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
 7643   ins_encode %{
 7644     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7645 
 7646     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $tmp$$Register, $vtmp$$XMMRegister);
 7647   %}
 7648   ins_pipe( pipe_slow );
 7649 %}
 7650 
 7651 instruct vextractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
 7652   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
 7653             Matcher::vector_length(n->in(1)/*src*/) == 16);
 7654   match(Set dst (ExtractF src idx));
 7655   effect(TEMP tmp, TEMP vtmp);
 7656   format %{ "vextractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
 7657   ins_encode %{
 7658     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7659 
 7660     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7661     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant, $tmp$$Register);
 7662   %}
 7663   ins_pipe( pipe_slow );
 7664 %}
 7665 
 7666 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
 7667   predicate(Matcher::vector_length(n->in(1)) == 2); // src
 7668   match(Set dst (ExtractD src idx));
 7669   format %{ "extractD $dst,$src,$idx\t!" %}
 7670   ins_encode %{
 7671     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7672 
 7673     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7674   %}
 7675   ins_pipe( pipe_slow );
 7676 %}
 7677 
 7678 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
 7679   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
 7680             Matcher::vector_length(n->in(1)) == 8);  // src
 7681   match(Set dst (ExtractD src idx));
 7682   effect(TEMP vtmp);
 7683   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
 7684   ins_encode %{
 7685     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
 7686 
 7687     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
 7688     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
 7689   %}
 7690   ins_pipe( pipe_slow );
 7691 %}
 7692 
 7693 // --------------------------------- Vector Blend --------------------------------------
 7694 
 7695 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
 7696   predicate(UseAVX == 0);
 7697   match(Set dst (VectorBlend (Binary dst src) mask));
 7698   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
 7699   effect(TEMP tmp);
 7700   ins_encode %{
 7701     assert(UseSSE >= 4, "required");
 7702 
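    // SSE4.1 pblendvb selects bytes using xmm0 as an implicit mask operand;
    // the rxmm0 operand class pins $tmp to xmm0, so the mask is copied
    // there first unless it is already in xmm0.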
 7703     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
 7704       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
 7705     }
 7706     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
 7707   %}
 7708   ins_pipe( pipe_slow );
 7709 %}
 7710 
 7711 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7712   predicate(UseAVX > 0 &&
 7713             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
 7714             Matcher::vector_length_in_bytes(n) <= 32 &&
 7715             is_integral_type(Matcher::vector_element_basic_type(n)));
 7716   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7717   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7718   ins_encode %{
 7719     int vlen_enc = vector_length_encoding(this);
 7720     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7721   %}
 7722   ins_pipe( pipe_slow );
 7723 %}
 7724 
 7725 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
 7726   predicate(UseAVX > 0 &&
 7727             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
 7728             Matcher::vector_length_in_bytes(n) <= 32 &&
 7729             !is_integral_type(Matcher::vector_element_basic_type(n)));
 7730   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7731   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
 7732   ins_encode %{
 7733     int vlen_enc = vector_length_encoding(this);
 7734     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
 7735   %}
 7736   ins_pipe( pipe_slow );
 7737 %}
 7738 
 7739 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch, kReg ktmp) %{
 7740   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
 7741             n->in(2)->bottom_type()->isa_vectmask() == NULL);
 7742   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7743   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch and k2 as TEMP" %}
 7744   effect(TEMP scratch, TEMP ktmp);
 7745   ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
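    // Recover a k-register mask from the -1/0 vector $mask by comparing it
    // against all ones, then perform the blend under that mask.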
 7748     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register);
 7749     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7750   %}
 7751   ins_pipe( pipe_slow );
 7752 %}
 7753 
 7754 
 7755 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask, rRegP scratch) %{
 7756   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
 7757             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
 7758              VM_Version::supports_avx512bw()));
 7759   match(Set dst (VectorBlend (Binary src1 src2) mask));
 7760   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch and k2 as TEMP" %}
 7761   effect(TEMP scratch);
 7762   ins_encode %{
 7763     int vlen_enc = vector_length_encoding(this);
 7764     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 7765     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 7766   %}
 7767   ins_pipe( pipe_slow );
 7768 %}
 7769 
 7770 // --------------------------------- ABS --------------------------------------
 7771 // a = |a|
 7772 instruct vabsB_reg(vec dst, vec src) %{
 7773   match(Set dst (AbsVB  src));
 7774   ins_cost(450);
 7775   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
 7776   ins_encode %{
 7777     uint vlen = Matcher::vector_length(this);
 7778     if (vlen <= 16) {
 7779       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 7780     } else {
 7781       int vlen_enc = vector_length_encoding(this);
 7782       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7783     }
 7784   %}
 7785   ins_pipe( pipe_slow );
 7786 %}
 7787 
 7788 instruct vabsS_reg(vec dst, vec src) %{
 7789   match(Set dst (AbsVS  src));
 7790   ins_cost(450);
 7791   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
 7792   ins_encode %{
 7793     uint vlen = Matcher::vector_length(this);
 7794     if (vlen <= 8) {
 7795       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 7796     } else {
 7797       int vlen_enc = vector_length_encoding(this);
 7798       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7799     }
 7800   %}
 7801   ins_pipe( pipe_slow );
 7802 %}
 7803 
 7804 instruct vabsI_reg(vec dst, vec src) %{
 7805   match(Set dst (AbsVI  src));
 7806   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
 7807   ins_cost(250);
 7808   ins_encode %{
 7809     uint vlen = Matcher::vector_length(this);
 7810     if (vlen <= 4) {
 7811       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 7812     } else {
 7813       int vlen_enc = vector_length_encoding(this);
 7814       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7815     }
 7816   %}
 7817   ins_pipe( pipe_slow );
 7818 %}
 7819 
 7820 instruct vabsL_reg(vec dst, vec src) %{
 7821   match(Set dst (AbsVL  src));
 7822   ins_cost(450);
 7823   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
 7824   ins_encode %{
 7825     assert(UseAVX > 2, "required");
 7826     int vlen_enc = vector_length_encoding(this);
 7827     if (!VM_Version::supports_avx512vl()) {
 7828       vlen_enc = Assembler::AVX_512bit;
 7829     }
 7830     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 7831   %}
 7832   ins_pipe( pipe_slow );
 7833 %}
 7834 
 7835 // --------------------------------- ABSNEG --------------------------------------
 7836 
 7837 instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
 7838   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
 7839   match(Set dst (AbsVF src));
 7840   match(Set dst (NegVF src));
 7841   effect(TEMP scratch);
 7842   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
 7843   ins_cost(150);
 7844   ins_encode %{
 7845     int opcode = this->ideal_Opcode();
 7846     int vlen = Matcher::vector_length(this);
 7847     if (vlen == 2) {
 7848       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
 7849     } else {
 7850       assert(vlen == 8 || vlen == 16, "required");
 7851       int vlen_enc = vector_length_encoding(this);
 7852       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
 7853     }
 7854   %}
 7855   ins_pipe( pipe_slow );
 7856 %}
 7857 
 7858 instruct vabsneg4F(vec dst, rRegI scratch) %{
 7859   predicate(Matcher::vector_length(n) == 4);
 7860   match(Set dst (AbsVF dst));
 7861   match(Set dst (NegVF dst));
 7862   effect(TEMP scratch);
 7863   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
 7864   ins_cost(150);
 7865   ins_encode %{
 7866     int opcode = this->ideal_Opcode();
 7867     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
 7868   %}
 7869   ins_pipe( pipe_slow );
 7870 %}
 7871 
 7872 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
 7873   match(Set dst (AbsVD  src));
 7874   match(Set dst (NegVD  src));
 7875   effect(TEMP scratch);
 7876   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
 7877   ins_encode %{
 7878     int opcode = this->ideal_Opcode();
 7879     uint vlen = Matcher::vector_length(this);
 7880     if (vlen == 2) {
 7881       assert(UseSSE >= 2, "required");
 7882       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
 7883     } else {
 7884       int vlen_enc = vector_length_encoding(this);
 7885       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
 7886     }
 7887   %}
 7888   ins_pipe( pipe_slow );
 7889 %}
 7890 
 7891 //------------------------------------- VectorTest --------------------------------------------
 7892 
 7893 #ifdef _LP64
 7894 instruct vptest_alltrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp1, legVec vtmp2, rFlagsReg cr) %{
 7895   predicate(!VM_Version::supports_avx512bwdq() &&
 7896             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
 7897             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
 7898             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
  match(Set dst (VectorTest src1 src2));
 7900   effect(TEMP vtmp1, TEMP vtmp2, KILL cr);
 7901   format %{ "vptest_alltrue_lt16 $dst,$src1, $src2\t! using $vtmp1, $vtmp2 and $cr as TEMP" %}
 7902   ins_encode %{
 7903     int vlen = Matcher::vector_length_in_bytes(this, $src1);
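    // alltrue via BoolTest::overflow: with $src2 holding all ones, vptest
    // sets CF only when ($src2 & ~$src1) == 0, i.e. every lane of $src1 is set.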
 7904     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
 7905     __ setb(Assembler::carrySet, $dst$$Register);
 7906     __ movzbl($dst$$Register, $dst$$Register);
 7907   %}
 7908   ins_pipe( pipe_slow );
 7909 %}
 7910 
 7911 instruct vptest_alltrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
 7912   predicate(!VM_Version::supports_avx512bwdq() &&
 7913             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
 7914             Matcher::vector_length_in_bytes(n->in(1)) <  64 &&
 7915             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
  match(Set dst (VectorTest src1 src2));
 7917   effect(KILL cr);
 7918   format %{ "vptest_alltrue_ge16  $dst,$src1, $src2\t! using $cr as TEMP" %}
 7919   ins_encode %{
 7920     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 7921     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
 7922     __ setb(Assembler::carrySet, $dst$$Register);
 7923     __ movzbl($dst$$Register, $dst$$Register);
 7924   %}
 7925   ins_pipe( pipe_slow );
 7926 %}
 7927 
 7928 instruct vptest_alltrue_lt8_evex(rRegI dst, kReg src1, kReg src2, kReg kscratch, rFlagsReg cr) %{
 7929   predicate(VM_Version::supports_avx512bwdq() &&
 7930             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
 7931             n->in(1)->bottom_type()->isa_vectmask() &&
 7932             Matcher::vector_length(n->in(1)) < 8);
 7933   match(Set dst (VectorTest src1 src2));
 7934   effect(KILL cr, TEMP kscratch);
 7935   format %{ "vptest_alltrue_lt8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
 7936   ins_encode %{
 7937     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
 7938     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
 7939     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
 7940     uint masklen = Matcher::vector_length(this, $src1);
 7941     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, $kscratch$$KRegister);
 7942   %}
 7943   ins_pipe( pipe_slow );
 7944 %}
 7945 
 7946 
 7947 instruct vptest_alltrue_ge8_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
 7948   predicate(VM_Version::supports_avx512bwdq() &&
 7949             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
 7950             n->in(1)->bottom_type()->isa_vectmask() &&
 7951             Matcher::vector_length(n->in(1)) >= 8);
 7952   match(Set dst (VectorTest src1 src2));
 7953   effect(KILL cr);
 7954   format %{ "vptest_alltrue_ge8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
 7955   ins_encode %{
 7956     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
 7957     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
 7958     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
 7959     uint masklen = Matcher::vector_length(this, $src1);
 7960     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, knoreg);
 7961   %}
 7962   ins_pipe( pipe_slow );
 7963 %}
 7964 
 7965 
 7966 instruct vptest_anytrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp, rFlagsReg cr) %{
 7967   predicate(!VM_Version::supports_avx512bwdq() &&
 7968             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
 7969             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
 7970             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
  match(Set dst (VectorTest src1 src2));
 7972   effect(TEMP vtmp, KILL cr);
 7973   format %{ "vptest_anytrue_lt16 $dst,$src1,$src2\t! using $vtmp, $cr as TEMP" %}
 7974   ins_encode %{
 7975     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 7976     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 7977     __ setb(Assembler::notZero, $dst$$Register);
 7978     __ movzbl($dst$$Register, $dst$$Register);
 7979   %}
 7980   ins_pipe( pipe_slow );
 7981 %}
 7982 
 7983 instruct vptest_anytrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
 7984   predicate(!VM_Version::supports_avx512bwdq() &&
 7985             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
 7986             Matcher::vector_length_in_bytes(n->in(1)) < 64  &&
 7987             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
  match(Set dst (VectorTest src1 src2));
 7989   effect(KILL cr);
 7990   format %{ "vptest_anytrue_ge16 $dst,$src1,$src2\t! using $cr as TEMP" %}
 7991   ins_encode %{
 7992     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 7993     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
 7994     __ setb(Assembler::notZero, $dst$$Register);
 7995     __ movzbl($dst$$Register, $dst$$Register);
 7996   %}
 7997   ins_pipe( pipe_slow );
 7998 %}
 7999 
 8000 instruct vptest_anytrue_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
 8001   predicate(VM_Version::supports_avx512bwdq() &&
 8002             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
 8003   match(Set dst (VectorTest src1 src2));
 8004   effect(KILL cr);
 8005   format %{ "vptest_anytrue_lt8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
 8006   ins_encode %{
 8007     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
 8008     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
 8009     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
    uint masklen = Matcher::vector_length(this, $src1);
 8011     __ anytrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister);
 8012   %}
 8013   ins_pipe( pipe_slow );
 8014 %}
 8015 
 8016 instruct cmpvptest_anytrue_lt16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, legVec vtmp) %{
 8017   predicate(!VM_Version::supports_avx512bwdq() &&
 8018             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 4 &&
 8019             Matcher::vector_length_in_bytes(n->in(1)->in(1)) < 16 &&
 8020             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
 8021   match(Set cr (CmpI (VectorTest src1 src2) zero));
 8022   effect(TEMP vtmp);
 8023   format %{ "cmpvptest_anytrue_lt16 $src1,$src2\t! using $vtmp as TEMP" %}
 8024   ins_encode %{
 8025     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8026     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
 8027   %}
 8028   ins_pipe( pipe_slow );
 8029 %}
 8030 
 8031 instruct cmpvptest_anytrue_ge16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero) %{
 8032   predicate(!VM_Version::supports_avx512bwdq() &&
 8033             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 16 &&
 8034             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <  64 &&
 8035             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
 8036   match(Set cr (CmpI (VectorTest src1 src2) zero));
 8037   format %{ "cmpvptest_anytrue_ge16 $src1,$src2\t!" %}
 8038   ins_encode %{
 8039     int vlen = Matcher::vector_length_in_bytes(this, $src1);
 8040     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
 8041   %}
 8042   ins_pipe( pipe_slow );
 8043 %}
 8044 
 8045 instruct cmpvptest_anytrue_evex(rFlagsReg cr, kReg src1, kReg src2, immI_0 zero) %{
 8046   predicate(VM_Version::supports_avx512bwdq() &&
 8047             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
 8048   match(Set cr (CmpI (VectorTest src1 src2) zero));
 8049   format %{ "cmpvptest_anytrue_evex $src1,$src2\t!" %}
 8050   ins_encode %{
 8051     uint masklen = Matcher::vector_length(this, $src1);
 8052     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
 8053     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
 8054     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
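    // kortest/ktest operate on at least a full byte of the mask register,
    // so round sub-byte mask lengths up to 8 bits.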
 8055     masklen = masklen < 8 ? 8 : masklen;
 8056     __ ktest(masklen, $src1$$KRegister, $src2$$KRegister);
 8057   %}
 8058   ins_pipe( pipe_slow );
 8059 %}
 8060 #endif
 8061 
 8062 //------------------------------------- LoadMask --------------------------------------------
 8063 
 8064 instruct loadMask(legVec dst, legVec src) %{
 8065   predicate(n->bottom_type()->isa_vectmask() == NULL && !VM_Version::supports_avx512vlbw());
 8066   match(Set dst (VectorLoadMask src));
 8067   effect(TEMP dst);
 8068   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
 8069   ins_encode %{
 8070     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8071     BasicType elem_bt = Matcher::vector_element_basic_type(this);
 8072     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
 8073   %}
 8074   ins_pipe( pipe_slow );
 8075 %}
 8076 
 8077 instruct loadMask64(kReg dst, vec src, vec xtmp, rRegI tmp) %{
 8078   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8079   match(Set dst (VectorLoadMask src));
 8080   effect(TEMP xtmp, TEMP tmp);
 8081   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp and $tmp as TEMP" %}
 8082   ins_encode %{
 8083     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8084                         $tmp$$Register, true, Assembler::AVX_512bit);
 8085   %}
 8086   ins_pipe( pipe_slow );
 8087 %}
 8088 
 8089 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
 8090   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8091   match(Set dst (VectorLoadMask src));
 8092   effect(TEMP xtmp);
 8093   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
 8094   ins_encode %{
 8095     int vlen_enc = vector_length_encoding(in(1));
 8096     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 8097                         noreg, false, vlen_enc);
 8098   %}
 8099   ins_pipe( pipe_slow );
 8100 %}
 8101 
 8102 //------------------------------------- StoreMask --------------------------------------------
 8103 
 8104 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
 8105   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8106   match(Set dst (VectorStoreMask src size));
 8107   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8108   ins_encode %{
 8109     int vlen = Matcher::vector_length(this);
 8110     if (vlen <= 16 && UseAVX <= 2) {
 8111       assert(UseSSE >= 3, "required");
 8112       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
 8113     } else {
 8114       assert(UseAVX > 0, "required");
 8115       int src_vlen_enc = vector_length_encoding(this, $src);
 8116       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8117     }
 8118   %}
 8119   ins_pipe( pipe_slow );
 8120 %}
 8121 
 8122 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
 8123   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8124   match(Set dst (VectorStoreMask src size));
 8125   effect(TEMP_DEF dst, TEMP xtmp);
 8126   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8127   ins_encode %{
 8128     int vlen_enc = Assembler::AVX_128bit;
 8129     int vlen = Matcher::vector_length(this);
 8130     if (vlen <= 8) {
 8131       assert(UseSSE >= 3, "required");
 8132       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8133       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
 8134       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8135     } else {
 8136       assert(UseAVX > 0, "required");
 8137       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8138       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8139       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8140     }
 8141   %}
 8142   ins_pipe( pipe_slow );
 8143 %}
 8144 
 8145 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
 8146   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8147   match(Set dst (VectorStoreMask src size));
 8148   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8149   effect(TEMP_DEF dst, TEMP xtmp);
 8150   ins_encode %{
 8151     int vlen_enc = Assembler::AVX_128bit;
 8152     int vlen = Matcher::vector_length(this);
 8153     if (vlen <= 4) {
 8154       assert(UseSSE >= 3, "required");
 8155       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8156       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
 8157       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8158       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8159     } else {
 8160       assert(UseAVX > 0, "required");
 8161       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8162       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
 8163       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8164       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
 8165       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8166     }
 8167   %}
 8168   ins_pipe( pipe_slow );
 8169 %}
 8170 
 8171 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
 8172   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
 8173   match(Set dst (VectorStoreMask src size));
 8174   effect(TEMP_DEF dst, TEMP xtmp);
 8175   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8176   ins_encode %{
 8177     assert(UseSSE >= 3, "required");
 8178     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
 8179     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
 8180     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
 8181     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
 8182     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
 8183   %}
 8184   ins_pipe( pipe_slow );
 8185 %}
 8186 
 8187 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
 8188   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
 8189   match(Set dst (VectorStoreMask src size));
 8190   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
 8191   effect(TEMP_DEF dst, TEMP vtmp);
 8192   ins_encode %{
 8193     int vlen_enc = Assembler::AVX_128bit;
 8194     __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
 8195     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
 8196     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
 8197     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8198     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8199     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8200     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8201   %}
 8202   ins_pipe( pipe_slow );
 8203 %}
 8204 
 8205 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
 8206   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8207   match(Set dst (VectorStoreMask src size));
 8208   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8209   ins_encode %{
 8210     int src_vlen_enc = vector_length_encoding(this, $src);
 8211     int dst_vlen_enc = vector_length_encoding(this);
 8212     if (!VM_Version::supports_avx512vl()) {
 8213       src_vlen_enc = Assembler::AVX_512bit;
 8214     }
 8215     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8216     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8217   %}
 8218   ins_pipe( pipe_slow );
 8219 %}
 8220 
 8221 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
 8222   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
 8223   match(Set dst (VectorStoreMask src size));
 8224   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
 8225   ins_encode %{
 8226     int src_vlen_enc = vector_length_encoding(this, $src);
 8227     int dst_vlen_enc = vector_length_encoding(this);
 8228     if (!VM_Version::supports_avx512vl()) {
 8229       src_vlen_enc = Assembler::AVX_512bit;
 8230     }
 8231     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
 8232     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8233   %}
 8234   ins_pipe( pipe_slow );
 8235 %}
 8236 
 8237 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size, rRegI tmp) %{
 8238   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
 8239   match(Set dst (VectorStoreMask mask size));
 8240   effect(TEMP_DEF dst, TEMP tmp);
 8241   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8242   ins_encode %{
 8243     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
 8244     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
 8245                  false, Assembler::AVX_512bit, $tmp$$Register);
 8246     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
 8247   %}
 8248   ins_pipe( pipe_slow );
 8249 %}
 8250 
 8251 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
 8252   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
 8253   match(Set dst (VectorStoreMask mask size));
 8254   effect(TEMP_DEF dst);
 8255   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
 8256   ins_encode %{
 8257     int dst_vlen_enc = vector_length_encoding(this);
 8258     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
 8259     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
 8260   %}
 8261   ins_pipe( pipe_slow );
 8262 %}
 8263 
 8264 instruct vmaskcast_evex(kReg dst) %{
 8265   predicate(Matcher::vector_length(n) == Matcher::vector_length(n->in(1)));
 8266   match(Set dst (VectorMaskCast dst));
 8267   ins_cost(0);
 8268   format %{ "vector_mask_cast $dst" %}
 8269   ins_encode %{
 8270     // empty
 8271   %}
 8272   ins_pipe(empty);
 8273 %}
 8274 
 8275 instruct vmaskcast(vec dst) %{
 8276   predicate((Matcher::vector_length(n) == Matcher::vector_length(n->in(1))) &&
 8277             (Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))));
 8278   match(Set dst (VectorMaskCast dst));
 8279   ins_cost(0);
 8280   format %{ "vector_mask_cast $dst" %}
 8281   ins_encode %{
 8282     // empty
 8283   %}
 8284   ins_pipe(empty);
 8285 %}
 8286 
 8287 //-------------------------------- Load Iota Indices ----------------------------------
 8288 
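// Loads the constant index sequence {0, 1, 2, ...} (byte elements) into a
// vector register.
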
 8289 instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
 8290   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8291   match(Set dst (VectorLoadConst src));
 8292   effect(TEMP scratch);
 8293   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
 8294   ins_encode %{
    int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
    __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen_in_bytes);
 8297   %}
 8298   ins_pipe( pipe_slow );
 8299 %}
 8300 
 8301 //-------------------------------- Rearrange ----------------------------------
 8302 
 8303 // LoadShuffle/Rearrange for Byte
 8304 
 8305 instruct loadShuffleB(vec dst) %{
 8306   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
 8307   match(Set dst (VectorLoadShuffle dst));
 8308   format %{ "vector_load_shuffle $dst, $dst" %}
 8309   ins_encode %{
 8310     // empty
 8311   %}
 8312   ins_pipe( pipe_slow );
 8313 %}
 8314 
 8315 instruct rearrangeB(vec dst, vec shuffle) %{
 8316   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8317             Matcher::vector_length(n) < 32);
 8318   match(Set dst (VectorRearrange dst shuffle));
 8319   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8320   ins_encode %{
 8321     assert(UseSSE >= 4, "required");
 8322     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8323   %}
 8324   ins_pipe( pipe_slow );
 8325 %}
 8326 
 8327 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
 8328   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8329             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
 8330   match(Set dst (VectorRearrange src shuffle));
 8331   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
 8332   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
 8333   ins_encode %{
 8334     assert(UseAVX >= 2, "required");
 8335     // Swap src into vtmp1
 8336     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8337     // Shuffle swapped src to get entries from other 128 bit lane
 8338     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8339     // Shuffle original src to get entries from self 128 bit lane
 8340     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8341     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8342     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
 8343     // Perform the blend
 8344     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8345   %}
 8346   ins_pipe( pipe_slow );
 8347 %}
 8348 
 8349 instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
 8350   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
 8351             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
 8352   match(Set dst (VectorRearrange src shuffle));
 8353   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8354   ins_encode %{
 8355     int vlen_enc = vector_length_encoding(this);
 8356     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8357   %}
 8358   ins_pipe( pipe_slow );
 8359 %}
 8360 
 8361 // LoadShuffle/Rearrange for Short
 8362 
 8363 instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
 8364   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8365             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
 8366   match(Set dst (VectorLoadShuffle src));
 8367   effect(TEMP dst, TEMP vtmp, TEMP scratch);
 8368   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
 8369   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
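    // For example, a short shuffle index of 3 expands to the byte index
    // pair {6, 7}: 3 * 2 = 6 selects the low byte, 6 + 1 = 7 the high byte.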
 8372     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
 8373     if (UseAVX == 0) {
 8374       assert(vlen_in_bytes <= 16, "required");
 8375       // Multiply each shuffle by two to get byte index
 8376       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
 8377       __ psllw($vtmp$$XMMRegister, 1);
 8378 
 8379       // Duplicate to create 2 copies of byte index
 8380       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8381       __ psllw($dst$$XMMRegister, 8);
 8382       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
 8383 
 8384       // Add one to get alternate byte index
 8385       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
 8386       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8387     } else {
 8388       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
 8389       int vlen_enc = vector_length_encoding(this);
 8390       // Multiply each shuffle by two to get byte index
 8391       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8392       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8393 
 8394       // Duplicate to create 2 copies of byte index
 8395       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
 8396       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8397 
 8398       // Add one to get alternate byte index
 8399       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, $scratch$$Register);
 8400     }
 8401   %}
 8402   ins_pipe( pipe_slow );
 8403 %}
 8404 
 8405 instruct rearrangeS(vec dst, vec shuffle) %{
 8406   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8407             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
 8408   match(Set dst (VectorRearrange dst shuffle));
 8409   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8410   ins_encode %{
 8411     assert(UseSSE >= 4, "required");
 8412     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8413   %}
 8414   ins_pipe( pipe_slow );
 8415 %}
 8416 
 8417 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
 8418   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8419             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
 8420   match(Set dst (VectorRearrange src shuffle));
 8421   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
 8422   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
 8423   ins_encode %{
 8424     assert(UseAVX >= 2, "required");
 8425     // Swap src into vtmp1
 8426     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
 8427     // Shuffle swapped src to get entries from other 128 bit lane
 8428     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8429     // Shuffle original src to get entries from self 128 bit lane
 8430     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
 8431     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
 8432     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
 8433     // Perform the blend
 8434     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
 8435   %}
 8436   ins_pipe( pipe_slow );
 8437 %}
 8438 
 8439 instruct loadShuffleS_evex(vec dst, vec src) %{
 8440   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8441             VM_Version::supports_avx512bw());
 8442   match(Set dst (VectorLoadShuffle src));
 8443   format %{ "vector_load_shuffle $dst, $src" %}
 8444   ins_encode %{
 8445     int vlen_enc = vector_length_encoding(this);
 8446     if (!VM_Version::supports_avx512vl()) {
 8447       vlen_enc = Assembler::AVX_512bit;
 8448     }
 8449     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8450   %}
 8451   ins_pipe( pipe_slow );
 8452 %}
 8453 
 8454 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
 8455   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
 8456             VM_Version::supports_avx512bw());
 8457   match(Set dst (VectorRearrange src shuffle));
 8458   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8459   ins_encode %{
 8460     int vlen_enc = vector_length_encoding(this);
 8461     if (!VM_Version::supports_avx512vl()) {
 8462       vlen_enc = Assembler::AVX_512bit;
 8463     }
 8464     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8465   %}
 8466   ins_pipe( pipe_slow );
 8467 %}
 8468 
 8469 // LoadShuffle/Rearrange for Integer and Float
 8470 
 8471 instruct loadShuffleI(vec dst, vec src, vec vtmp, rRegP scratch) %{
 8472   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8473             Matcher::vector_length(n) == 4 && UseAVX < 2);
 8474   match(Set dst (VectorLoadShuffle src));
 8475   effect(TEMP dst, TEMP vtmp, TEMP scratch);
 8476   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
 8477   ins_encode %{
 8478     assert(UseSSE >= 4, "required");
 8479 
    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
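    // For example, an int shuffle index of 2 expands to the byte indices
    // {8, 9, 10, 11}: 2 * 4 = 8, plus the per-byte offsets 0..3.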
 8482 
 8483     // Duplicate and multiply each shuffle by 4
 8484     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
 8485     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8486     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
 8487     __ psllw($vtmp$$XMMRegister, 2);
 8488 
 8489     // Duplicate again to create 4 copies of byte index
 8490     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
 8491     __ psllw($dst$$XMMRegister, 8);
 8492     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
 8493 
 8494     // Add 3,2,1,0 to get alternate byte index
 8495     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
 8496     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
 8497   %}
 8498   ins_pipe( pipe_slow );
 8499 %}
 8500 
 8501 instruct rearrangeI(vec dst, vec shuffle) %{
 8502  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8503            Matcher::vector_length(n) == 4 && UseAVX < 2);
 8504   match(Set dst (VectorRearrange dst shuffle));
 8505   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
 8506   ins_encode %{
 8507     assert(UseSSE >= 4, "required");
 8508     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
 8509   %}
 8510   ins_pipe( pipe_slow );
 8511 %}
 8512 
 8513 instruct loadShuffleI_avx(vec dst, vec src) %{
 8514   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8515             UseAVX >= 2);
 8516   match(Set dst (VectorLoadShuffle src));
 8517   format %{ "vector_load_shuffle $dst, $src" %}
 8518   ins_encode %{
    int vlen_enc = vector_length_encoding(this);
 8520     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8521   %}
 8522   ins_pipe( pipe_slow );
 8523 %}
 8524 
 8525 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
 8526   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
 8527             UseAVX >= 2);
 8528   match(Set dst (VectorRearrange src shuffle));
 8529   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8530   ins_encode %{
 8531     int vlen_enc = vector_length_encoding(this);
 8532     if (vlen_enc == Assembler::AVX_128bit) {
 8533       vlen_enc = Assembler::AVX_256bit;
 8534     }
 8535     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8536   %}
 8537   ins_pipe( pipe_slow );
 8538 %}
 8539 
 8540 // LoadShuffle/Rearrange for Long and Double
 8541 
 8542 instruct loadShuffleL(vec dst, vec src, vec vtmp, rRegP scratch) %{
 8543   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8544             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8545   match(Set dst (VectorLoadShuffle src));
 8546   effect(TEMP dst, TEMP vtmp, TEMP scratch);
 8547   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
 8548   ins_encode %{
 8549     assert(UseAVX >= 2, "required");
 8550 
 8551     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask;
    // only a double word shuffle instruction is available on these platforms.
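    // For example, a long shuffle index of 3 expands to the double word
    // index pair {6, 7}: 3 * 2 = 6 for the low half, plus 1 for the high half.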
 8554 
 8555     // Multiply each shuffle by two to get double word index
 8556     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8557     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
 8558 
 8559     // Duplicate each double word shuffle
 8560     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
 8561     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
 8562 
 8563     // Add one to get alternate double word index
 8564     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, $scratch$$Register);
 8565   %}
 8566   ins_pipe( pipe_slow );
 8567 %}
 8568 
 8569 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
 8570   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8571             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
 8572   match(Set dst (VectorRearrange src shuffle));
 8573   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8574   ins_encode %{
 8575     assert(UseAVX >= 2, "required");
 8576 
 8577     int vlen_enc = vector_length_encoding(this);
 8578     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8579   %}
 8580   ins_pipe( pipe_slow );
 8581 %}
 8582 
 8583 instruct loadShuffleL_evex(vec dst, vec src) %{
 8584   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8585             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8586   match(Set dst (VectorLoadShuffle src));
 8587   format %{ "vector_load_shuffle $dst, $src" %}
 8588   ins_encode %{
 8589     assert(UseAVX > 2, "required");
 8590 
 8591     int vlen_enc = vector_length_encoding(this);
 8592     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8593   %}
 8594   ins_pipe( pipe_slow );
 8595 %}
 8596 
 8597 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
 8598   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
 8599             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
 8600   match(Set dst (VectorRearrange src shuffle));
 8601   format %{ "vector_rearrange $dst, $shuffle, $src" %}
 8602   ins_encode %{
 8603     assert(UseAVX > 2, "required");
 8604 
 8605     int vlen_enc = vector_length_encoding(this);
 8606     if (vlen_enc == Assembler::AVX_128bit) {
 8607       vlen_enc = Assembler::AVX_256bit;
 8608     }
 8609     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8610   %}
 8611   ins_pipe( pipe_slow );
 8612 %}
 8613 
 8614 // --------------------------------- FMA --------------------------------------
 8615 // a * b + c
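// The multiply and add are fused: the intermediate product is not rounded,
// so each lane incurs a single rounding step (the semantics Math.fma requires).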
 8616 
 8617 instruct vfmaF_reg(vec a, vec b, vec c) %{
 8618   match(Set c (FmaVF  c (Binary a b)));
 8619   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8620   ins_cost(150);
 8621   ins_encode %{
 8622     assert(UseFMA, "not enabled");
 8623     int vlen_enc = vector_length_encoding(this);
 8624     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8625   %}
 8626   ins_pipe( pipe_slow );
 8627 %}
 8628 
 8629 instruct vfmaF_mem(vec a, memory b, vec c) %{
 8630   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8631   match(Set c (FmaVF  c (Binary a (LoadVector b))));
 8632   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
 8633   ins_cost(150);
 8634   ins_encode %{
 8635     assert(UseFMA, "not enabled");
 8636     int vlen_enc = vector_length_encoding(this);
 8637     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8638   %}
 8639   ins_pipe( pipe_slow );
 8640 %}
 8641 
 8642 instruct vfmaD_reg(vec a, vec b, vec c) %{
 8643   match(Set c (FmaVD  c (Binary a b)));
 8644   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8645   ins_cost(150);
 8646   ins_encode %{
 8647     assert(UseFMA, "not enabled");
 8648     int vlen_enc = vector_length_encoding(this);
 8649     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
 8650   %}
 8651   ins_pipe( pipe_slow );
 8652 %}
 8653 
 8654 instruct vfmaD_mem(vec a, memory b, vec c) %{
 8655   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
 8656   match(Set c (FmaVD  c (Binary a (LoadVector b))));
 8657   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
 8658   ins_cost(150);
 8659   ins_encode %{
 8660     assert(UseFMA, "not enabled");
 8661     int vlen_enc = vector_length_encoding(this);
 8662     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
 8663   %}
 8664   ins_pipe( pipe_slow );
 8665 %}
 8666 
 8667 // --------------------------------- Vector Multiply Add --------------------------------------
 8668 
 8669 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
 8670   predicate(UseAVX == 0);
 8671   match(Set dst (MulAddVS2VI dst src1));
 8672   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
 8673   ins_encode %{
 8674     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
 8675   %}
 8676   ins_pipe( pipe_slow );
 8677 %}
 8678 
 8679 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
 8680   predicate(UseAVX > 0);
 8681   match(Set dst (MulAddVS2VI src1 src2));
 8682   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
 8683   ins_encode %{
 8684     int vlen_enc = vector_length_encoding(this);
 8685     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8686   %}
 8687   ins_pipe( pipe_slow );
 8688 %}
 8689 
 8690 // --------------------------------- Vector Multiply Add Add ----------------------------------
 8691 
 8692 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
 8693   predicate(VM_Version::supports_avx512_vnni());
 8694   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
 8695   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
 8696   ins_encode %{
 8697     assert(UseAVX > 2, "required");
 8698     int vlen_enc = vector_length_encoding(this);
 8699     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
 8700   %}
 8701   ins_pipe( pipe_slow );
 8702   ins_cost(10);
 8703 %}
 8704 
 8705 // --------------------------------- PopCount --------------------------------------
 8706 
 8707 instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
 8708   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8709   match(Set dst (PopCountVI src));
 8710   match(Set dst (PopCountVL src));
 8711   ins_cost(400);
 8712   format %{ "vector_popcount_integral $dst, $src" %}
 8713   ins_encode %{
 8714     int opcode = this->ideal_Opcode();
 8715     int vlen_enc = vector_length_encoding(this, $src);
 8716     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8717     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
    // TODO: Once the auto-vectorizer supports the ConvL2I operation, PopCountVL
    // will be followed by its corresponding vector IR and the special handling
    // below should be removed.
 8721     if (opcode == Op_PopCountVL && Matcher::vector_element_basic_type(this) == T_INT) {
 8722       __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8723     }
 8724   %}
 8725   ins_pipe( pipe_slow );
 8726 %}
 8727 
 8728 instruct vpopcount_integral_reg_evex_masked(vec dst, vec src, kReg mask) %{
 8729   predicate(is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8730   match(Set dst (PopCountVI src mask));
 8731   match(Set dst (PopCountVL src mask));
 8732   format %{ "vector_popcount_integral_masked $dst, $src, $mask" %}
 8733   ins_encode %{
 8734     int vlen_enc = vector_length_encoding(this, $src);
 8735     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8736     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 8737     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, true, vlen_enc);
 8738   %}
 8739   ins_pipe( pipe_slow );
 8740 %}
 8741 
 8742 instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %{
 8743   predicate(!is_vector_popcount_predicate(Matcher::vector_element_basic_type(n->in(1))));
 8744   match(Set dst (PopCountVI src));
 8745   match(Set dst (PopCountVL src));
 8746   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
 8747   format %{ "vector_popcount_integral $dst, $src\t! using $xtmp1, $xtmp2, and $rtmp as TEMP" %}
 8748   ins_encode %{
 8749     int opcode = this->ideal_Opcode();
 8750     int vlen_enc = vector_length_encoding(this, $src);
 8751     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8752     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8753                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
    // TODO: Once the auto-vectorizer supports the ConvL2I operation, PopCountVL
    // will be followed by its corresponding vector IR and the special handling
    // below should be removed.
 8757     if (opcode == Op_PopCountVL && Matcher::vector_element_basic_type(this) == T_INT) {
 8758       if (VM_Version::supports_avx512vl()) {
 8759         __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8760       } else {
 8761         assert(VM_Version::supports_avx2(), "");
 8762         __ vpshufd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 8763         __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 8764       }
 8765     }
 8766   %}
 8767   ins_pipe( pipe_slow );
 8768 %}
 8769 
 8770 // --------------------------------- Vector Trailing Zeros Count --------------------------------------
 8771 
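// There is no direct vector tzcnt instruction, so the macro assembler derives
// the count from the isolated lowest set bit: for a lane x, x & -x has at most
// one bit set, and its position (e.g. via lzcnt, or a popcount of (x & -x) - 1)
// gives the trailing zero count, with an all-zero lane yielding the lane width.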
 8772 instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp) %{
 8773   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 8774                                               Matcher::vector_length_in_bytes(n->in(1))));
 8775   match(Set dst (CountTrailingZerosV src));
 8776   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
 8777   ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp and $rtmp as TEMP" %}
 8779   ins_encode %{
 8780     int vlen_enc = vector_length_encoding(this, $src);
 8781     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8782     BasicType rbt = Matcher::vector_element_basic_type(this);
 8783     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
 8784                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
    // TODO: Once the auto-vectorizer supports the ConvL2I operation,
    // CountTrailingZerosV will be followed by its corresponding vector IR and
    // the special handling below should be removed.
 8788     if (bt == T_LONG && rbt == T_INT) {
 8789       __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8790     }
 8791   %}
 8792   ins_pipe( pipe_slow );
 8793 %}
 8794 
 8795 instruct vcount_trailing_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8796   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 8797             VM_Version::supports_avx512cd() &&
 8798             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 8799   match(Set dst (CountTrailingZerosV src));
 8800   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8801   ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3 and $rtmp as TEMP" %}
 8803   ins_encode %{
 8804     int vlen_enc = vector_length_encoding(this, $src);
 8805     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8806     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8807                                         $xtmp2$$XMMRegister, xnoreg, $xtmp3$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
 8808   %}
 8809   ins_pipe( pipe_slow );
 8810 %}
 8811 
 8812 instruct vcount_trailing_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, kReg ktmp, rRegP rtmp) %{
 8813   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 8814   match(Set dst (CountTrailingZerosV src));
 8815   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP ktmp, TEMP rtmp);
 8816   ins_cost(400);
  format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4, $ktmp and $rtmp as TEMP" %}
 8818   ins_encode %{
 8819     int vlen_enc = vector_length_encoding(this, $src);
 8820     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8821     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8822                                         $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
 8823                                         $ktmp$$KRegister, $rtmp$$Register, vlen_enc);
 8824   %}
 8825   ins_pipe( pipe_slow );
 8826 %}
 8827 
 8828 instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 8829   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 8830   match(Set dst (CountTrailingZerosV src));
 8831   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 8832   format %{ "vector_count_trailing_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 8833   ins_encode %{
 8834     int vlen_enc = vector_length_encoding(this, $src);
 8835     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 8836     BasicType rbt = Matcher::vector_element_basic_type(this);
 8837     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 8838                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
    // TODO: Once the auto-vectorizer supports the ConvL2I operation,
    // CountTrailingZerosV will be followed by its corresponding vector IR and
    // the special handling below should be removed.
 8842     if (bt == T_LONG && rbt == T_INT) {
 8843       if (VM_Version::supports_avx512vl()) {
 8844         __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 8845       } else {
 8846         assert(VM_Version::supports_avx2(), "");
 8847         __ vpshufd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 8848         __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
 8849       }
 8850     }
 8851   %}
 8852   ins_pipe( pipe_slow );
 8853 %}
 8854 
 8856 // --------------------------------- Bitwise Ternary Logic ----------------------------------
 8857 
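// vpternlogd evaluates an arbitrary three-input boolean function per bit: the
// imm8 $func is a truth table indexed by the bit triple (dst, src2, src3), so
// for example func == 0x96 computes dst ^ src2 ^ src3 and func == 0xE8 the
// bitwise majority of the three inputs.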
 8858 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
 8859   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
 8860   effect(TEMP dst);
 8861   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8862   ins_encode %{
 8863     int vector_len = vector_length_encoding(this);
 8864     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
 8865   %}
 8866   ins_pipe( pipe_slow );
 8867 %}
 8868 
 8869 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
 8870   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
 8871   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
 8872   effect(TEMP dst);
 8873   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
 8874   ins_encode %{
 8875     int vector_len = vector_length_encoding(this);
 8876     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
 8877   %}
 8878   ins_pipe( pipe_slow );
 8879 %}
 8880 
 8881 // --------------------------------- Rotation Operations ----------------------------------
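// A rotate is rol(x, s) == (x << s) | (x >>> (width - s)); with AVX512 the
// vprold/vprolvd (and vprolq/vprolvq) family performs this in a single
// instruction, which is roughly what vprotate_imm/vprotate_var emit for
// INT and LONG lanes.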
 8882 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
 8883   match(Set dst (RotateLeftV src shift));
 8884   match(Set dst (RotateRightV src shift));
 8885   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
 8886   ins_encode %{
 8887     int opcode      = this->ideal_Opcode();
 8888     int vector_len  = vector_length_encoding(this);
 8889     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8890     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
 8891   %}
 8892   ins_pipe( pipe_slow );
 8893 %}
 8894 
instruct vprotate_var(vec dst, vec src, vec shift) %{
 8896   match(Set dst (RotateLeftV src shift));
 8897   match(Set dst (RotateRightV src shift));
 8898   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
 8899   ins_encode %{
 8900     int opcode      = this->ideal_Opcode();
 8901     int vector_len  = vector_length_encoding(this);
 8902     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
 8903     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
 8904   %}
 8905   ins_pipe( pipe_slow );
 8906 %}
 8907 
 8908 #ifdef _LP64
 8909 // ---------------------------------- Masked Operations ------------------------------------
 8910 
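// The rules in this block operate on AVX512 opmask (k) registers: masked
// compares, masked loads/stores, mask generation from a length, and mask
// queries such as toLong, trueCount and first/last-true.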
 8911 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
 8912   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
 8913   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
 8914   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
 8915   ins_encode %{
 8916     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
 8917     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
 8918 
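    // ktmp2 = ~mask and ktmp1 = (src1 == src2) restricted to mask, so their OR
    // is all-ones iff every selected lane compared equal; kortest then sets CF
    // and $dst keeps the -1 "all equal" sentinel. Otherwise tzcnt on the
    // negated compare mask yields the index of the first lane that failed.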
 8919     Label DONE;
 8920     int vlen_enc = vector_length_encoding(this, $src1);
 8921     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
 8922 
 8923     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
 8924     __ mov64($dst$$Register, -1L);
 8925     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
 8926     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
 8927     __ jccb(Assembler::carrySet, DONE);
 8928     __ kmovql($dst$$Register, $ktmp1$$KRegister);
 8929     __ notq($dst$$Register);
 8930     __ tzcntq($dst$$Register, $dst$$Register);
 8931     __ bind(DONE);
 8932   %}
 8933   ins_pipe( pipe_slow );
 8934 %}
 8935 
 8937 instruct vmasked_load64(vec dst, memory mem, kReg mask) %{
 8938   match(Set dst (LoadVectorMasked mem mask));
  format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked load" %}
 8940   ins_encode %{
    BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
 8942     int vector_len = vector_length_encoding(this);
 8943     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
 8944   %}
 8945   ins_pipe( pipe_slow );
 8946 %}
 8947 
 8948 instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
 8949   match(Set dst (VectorMaskGen len));
 8950   effect(TEMP temp);
 8951   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
 8952   ins_encode %{
 8953     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
 8954   %}
 8955   ins_pipe( pipe_slow );
 8956 %}
 8957 
 8958 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
 8959   match(Set dst (VectorMaskGen len));
 8960   format %{ "vector_mask_gen $len \t! vector mask generator" %}
 8961   effect(TEMP temp);
 8962   ins_encode %{
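    // Materialize a mask with the low $len bits set (e.g. len == 3 -> 0b111)
    // and transfer it into the opmask register.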
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
 8964     __ kmovql($dst$$KRegister, $temp$$Register);
 8965   %}
 8966   ins_pipe( pipe_slow );
 8967 %}
 8968 
 8969 instruct vmasked_store64(memory mem, vec src, kReg mask) %{
 8970   match(Set mem (StoreVectorMasked mem (Binary src mask)));
 8971   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
 8972   ins_encode %{
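    // This node's own type is the memory state, so the element type and the
    // vector length must be recovered from the stored value's defining MachNode.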
 8973     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
    BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
 8975     int vector_len = vector_length_encoding(src_node);
 8976     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, vector_len);
 8977   %}
 8978   ins_pipe( pipe_slow );
 8979 %}
 8980 
 8981 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
 8982   predicate(n->in(1)->bottom_type()->isa_vectmask());
 8983   match(Set dst (VectorMaskToLong mask));
 8984   effect(TEMP dst, KILL cr);
 8985   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
 8986   ins_encode %{
 8987     int opcode = this->ideal_Opcode();
 8988     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 8989     int mask_len = Matcher::vector_length(this, $mask);
 8990     int mask_size = mask_len * type2aelembytes(mbt);
 8991     int vlen_enc = vector_length_encoding(this, $mask);
 8992     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 8993                              $dst$$Register, mask_len, mask_size, vlen_enc);
 8994   %}
 8995   ins_pipe( pipe_slow );
 8996 %}
 8997 
 8998 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
 8999   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9000   match(Set dst (VectorMaskToLong mask));
 9001   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
 9002   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9003   ins_encode %{
 9004     int opcode = this->ideal_Opcode();
 9005     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9006     int mask_len = Matcher::vector_length(this, $mask);
 9007     int vlen_enc = vector_length_encoding(this, $mask);
 9008     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9009                              $dst$$Register, mask_len, mbt, vlen_enc);
 9010   %}
 9011   ins_pipe( pipe_slow );
 9012 %}
 9013 
 9014 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
 9015   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9016   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
 9017   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
 9018   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
 9019   ins_encode %{
 9020     int opcode = this->ideal_Opcode();
 9021     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9022     int mask_len = Matcher::vector_length(this, $mask);
 9023     int vlen_enc = vector_length_encoding(this, $mask);
 9024     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9025                              $dst$$Register, mask_len, mbt, vlen_enc);
 9026   %}
 9027   ins_pipe( pipe_slow );
 9028 %}
 9029 
 9030 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9031   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9032   match(Set dst (VectorMaskTrueCount mask));
 9033   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9034   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
 9035   ins_encode %{
 9036     int opcode = this->ideal_Opcode();
 9037     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9038     int mask_len = Matcher::vector_length(this, $mask);
 9039     int mask_size = mask_len * type2aelembytes(mbt);
 9040     int vlen_enc = vector_length_encoding(this, $mask);
 9041     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9042                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9043   %}
 9044   ins_pipe( pipe_slow );
 9045 %}
 9046 
 9047 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9048   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9049   match(Set dst (VectorMaskTrueCount mask));
 9050   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9051   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9052   ins_encode %{
 9053     int opcode = this->ideal_Opcode();
 9054     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9055     int mask_len = Matcher::vector_length(this, $mask);
 9056     int vlen_enc = vector_length_encoding(this, $mask);
 9057     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9058                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9059   %}
 9060   ins_pipe( pipe_slow );
 9061 %}
 9062 
 9063 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9064   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9065   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
 9066   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9067   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9068   ins_encode %{
 9069     int opcode = this->ideal_Opcode();
 9070     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9071     int mask_len = Matcher::vector_length(this, $mask);
 9072     int vlen_enc = vector_length_encoding(this, $mask);
 9073     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9074                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9075   %}
 9076   ins_pipe( pipe_slow );
 9077 %}
 9078 
 9079 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
 9080   predicate(n->in(1)->bottom_type()->isa_vectmask());
 9081   match(Set dst (VectorMaskFirstTrue mask));
 9082   match(Set dst (VectorMaskLastTrue mask));
 9083   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
 9084   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
 9085   ins_encode %{
 9086     int opcode = this->ideal_Opcode();
 9087     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9088     int mask_len = Matcher::vector_length(this, $mask);
 9089     int mask_size = mask_len * type2aelembytes(mbt);
 9090     int vlen_enc = vector_length_encoding(this, $mask);
 9091     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
 9092                              $tmp$$Register, mask_len, mask_size, vlen_enc);
 9093   %}
 9094   ins_pipe( pipe_slow );
 9095 %}
 9096 
 9097 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9098   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
 9099   match(Set dst (VectorMaskFirstTrue mask));
 9100   match(Set dst (VectorMaskLastTrue mask));
 9101   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9102   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9103   ins_encode %{
 9104     int opcode = this->ideal_Opcode();
 9105     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9106     int mask_len = Matcher::vector_length(this, $mask);
 9107     int vlen_enc = vector_length_encoding(this, $mask);
 9108     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9109                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9110   %}
 9111   ins_pipe( pipe_slow );
 9112 %}
 9113 
 9114 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
 9115   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
 9116   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
 9117   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
 9118   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
 9119   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
 9120   ins_encode %{
 9121     int opcode = this->ideal_Opcode();
 9122     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
 9123     int mask_len = Matcher::vector_length(this, $mask);
 9124     int vlen_enc = vector_length_encoding(this, $mask);
 9125     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
 9126                              $tmp$$Register, mask_len, mbt, vlen_enc);
 9127   %}
 9128   ins_pipe( pipe_slow );
 9129 %}
 9130 
 9131 // --------------------------------- Compress/Expand Operations ---------------------------
 9132 
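// vpcompress packs the lanes selected by $mask towards the least significant
// end of the destination; vpexpand is the inverse, scattering consecutive
// source lanes into the positions selected by $mask.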
 9133 instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
 9134   match(Set dst (CompressV src mask));
 9135   match(Set dst (ExpandV src mask));
 9136   format %{ "vector_compress_expand $dst, $src, $mask" %}
 9137   ins_encode %{
 9138     int opcode = this->ideal_Opcode();
 9139     int vector_len = vector_length_encoding(this);
 9140     BasicType bt  = Matcher::vector_element_basic_type(this);
 9141     __ vector_compress_expand(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$KRegister, false, bt, vector_len);
 9142   %}
 9143   ins_pipe( pipe_slow );
 9144 %}
 9145 
 9146 instruct vcompress_mask_reg_evex(kReg dst, kReg mask, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
 9147   match(Set dst (CompressM mask));
 9148   effect(TEMP rtmp1, TEMP rtmp2, KILL cr);
 9149   format %{ "mask_compress_evex $dst, $mask\t! using $rtmp1 and $rtmp2 as TEMP" %}
 9150   ins_encode %{
 9151     assert(this->in(1)->bottom_type()->isa_vectmask(), "");
 9152     int mask_len = Matcher::vector_length(this);
 9153     __ vector_mask_compress($dst$$KRegister, $mask$$KRegister, $rtmp1$$Register, $rtmp2$$Register, mask_len);
 9154   %}
 9155   ins_pipe( pipe_slow );
 9156 %}
 9157 
 9158 #endif // _LP64
 9159 
 9160 // -------------------------------- Bit and Byte Reversal Vector Operations ------------------------
 9161 
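// ReverseV reverses the bits of each lane. With GFNI this reduces to a
// vgf2p8affineqb against the bit-reflection matrix 0x8040201008040201 (the
// constant loaded below), plus a byte reversal for multi-byte lanes; without
// GFNI a shuffle-based lookup-table sequence is used instead.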
 9162 instruct vreverse_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9163   predicate(!VM_Version::supports_gfni());
 9164   match(Set dst (ReverseV src));
 9165   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
  format %{ "vector_reverse_bit $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9167   ins_encode %{
 9168     int vec_enc = vector_length_encoding(this);
 9169     BasicType bt = Matcher::vector_element_basic_type(this);
 9170     __ vector_reverse_bit(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9171                           $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9172   %}
 9173   ins_pipe( pipe_slow );
 9174 %}
 9175 
 9176 instruct vreverse_reg_gfni(vec dst, vec src, vec xtmp, rRegI rtmp) %{
 9177   predicate(VM_Version::supports_gfni());
 9178   match(Set dst (ReverseV src));
 9179   effect(TEMP dst, TEMP xtmp, TEMP rtmp);
  format %{ "vector_reverse_bit_gfni $dst, $src\t! using $rtmp and $xtmp as TEMP" %}
 9181   ins_encode %{
 9182     int vec_enc = vector_length_encoding(this);
 9183     BasicType bt  = Matcher::vector_element_basic_type(this);
 9184     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, 0x8040201008040201L, 1));
 9185     __ vector_reverse_bit_gfni(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
 9186                                addr, $rtmp$$Register, vec_enc);
 9187   %}
 9188   ins_pipe( pipe_slow );
 9189 %}
 9190 
 9191 instruct vreverse_byte_reg(vec dst, vec src, rRegI rtmp) %{
 9192   predicate(VM_Version::supports_avx512bw() || Matcher::vector_length_in_bytes(n) < 64);
 9193   match(Set dst (ReverseBytesV src));
 9194   effect(TEMP dst, TEMP rtmp);
  format %{ "vector_reverse_byte $dst, $src\t! using $rtmp as TEMP" %}
 9196   ins_encode %{
 9197     int vec_enc = vector_length_encoding(this);
 9198     BasicType bt = Matcher::vector_element_basic_type(this);
 9199     __ vector_reverse_byte(bt, $dst$$XMMRegister, $src$$XMMRegister, $rtmp$$Register, vec_enc);
 9200   %}
 9201   ins_pipe( pipe_slow );
 9202 %}
 9203 
 9204 instruct vreverse_byte64_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegI rtmp) %{
 9205   predicate(!VM_Version::supports_avx512bw() && Matcher::vector_length_in_bytes(n) == 64);
 9206   match(Set dst (ReverseBytesV src));
 9207   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp);
  format %{ "vector_reverse_byte $dst, $src\t! using $xtmp1, $xtmp2 and $rtmp as TEMP" %}
 9209   ins_encode %{
 9210     int vec_enc = vector_length_encoding(this);
 9211     BasicType bt = Matcher::vector_element_basic_type(this);
 9212     __ vector_reverse_byte64(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9213                              $xtmp2$$XMMRegister, $rtmp$$Register, vec_enc);
 9214   %}
 9215   ins_pipe( pipe_slow );
 9216 %}
 9217 
 9218 // ---------------------------------- Vector Count Leading Zeros -----------------------------------
 9219 
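// AVX512CD supplies vplzcntd/vplzcntq, so INT and LONG lanes get a direct
// leading-zero count; SHORT and BYTE lanes (and pre-AVX512 targets) are
// emulated in the macro assembler, see vector_count_leading_zeros_{evex,avx}.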
 9220 instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
 9221   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9222                                               Matcher::vector_length_in_bytes(n->in(1))));
 9223   match(Set dst (CountLeadingZerosV src));
 9224   format %{ "vector_count_leading_zeros $dst, $src" %}
 9225   ins_encode %{
    int vlen_enc = vector_length_encoding(this, $src);
    BasicType bt = Matcher::vector_element_basic_type(this, $src);
    BasicType rbt = Matcher::vector_element_basic_type(this);
    __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
                                       xnoreg, xnoreg, k0, noreg, true, vlen_enc);
    // TODO: Once the auto-vectorizer supports the ConvL2I operation,
    // CountLeadingZerosV will be followed by its corresponding vector IR and
    // the special handling below should be removed.
    if (rbt == T_INT && bt == T_LONG) {
      __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
    }
 9237   %}
 9238   ins_pipe( pipe_slow );
 9239 %}
 9240 
 9241 instruct vcount_leading_zeros_IL_reg_evex_masked(vec dst, vec src, kReg mask) %{
 9242   predicate(is_clz_non_subword_predicate_evex(Matcher::vector_element_basic_type(n->in(1)),
 9243                                               Matcher::vector_length_in_bytes(n->in(1))));
 9244   match(Set dst (CountLeadingZerosV src mask));
 9245   format %{ "vector_count_leading_zeros $dst, $src, $mask" %}
 9246   ins_encode %{
 9247     int vlen_enc = vector_length_encoding(this, $src);
 9248     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9249     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
 9250     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg,
 9251                                        xnoreg, $mask$$KRegister, noreg, true, vlen_enc);
 9252   %}
 9253   ins_pipe( pipe_slow );
 9254 %}
 9255 
 9256 instruct vcount_leading_zeros_short_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2) %{
 9257   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_SHORT &&
 9258             VM_Version::supports_avx512cd() &&
 9259             (VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64));
 9260   match(Set dst (CountLeadingZerosV src));
 9261   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2);
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1 and $xtmp2 as TEMP" %}
 9263   ins_encode %{
 9264     int vlen_enc = vector_length_encoding(this, $src);
 9265     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9266     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9267                                        $xtmp2$$XMMRegister, xnoreg, k0, noreg, true, vlen_enc);
 9268   %}
 9269   ins_pipe( pipe_slow );
 9270 %}
 9271 
 9272 instruct vcount_leading_zeros_byte_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, kReg ktmp, rRegP rtmp) %{
 9273   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_BYTE && VM_Version::supports_avx512vlbw());
 9274   match(Set dst (CountLeadingZerosV src));
 9275   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP ktmp, TEMP rtmp);
  format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, $ktmp and $rtmp as TEMP" %}
 9277   ins_encode %{
 9278     int vlen_enc = vector_length_encoding(this, $src);
 9279     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9280     __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9281                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $ktmp$$KRegister,
 9282                                        $rtmp$$Register, true, vlen_enc);
 9283   %}
 9284   ins_pipe( pipe_slow );
 9285 %}
 9286 
 9287 instruct vcount_leading_zeros_int_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3) %{
 9288   predicate(Matcher::vector_element_basic_type(n->in(1)) == T_INT &&
 9289             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9290   match(Set dst (CountLeadingZerosV src));
 9291   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3);
 9292   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2 and $xtmp3 as TEMP" %}
 9293   ins_encode %{
 9294     int vlen_enc = vector_length_encoding(this, $src);
 9295     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9296     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9297                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, noreg, vlen_enc);
 9298   %}
 9299   ins_pipe( pipe_slow );
 9300 %}
 9301 
 9302 instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp) %{
 9303   predicate(Matcher::vector_element_basic_type(n->in(1)) != T_INT &&
 9304             !VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n->in(1)) < 64);
 9305   match(Set dst (CountLeadingZerosV src));
 9306   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp);
 9307   format %{ "vector_count_leading_zeros $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
 9308   ins_encode %{
 9309     int vlen_enc = vector_length_encoding(this, $src);
 9310     BasicType bt = Matcher::vector_element_basic_type(this, $src);
 9311     BasicType rbt = Matcher::vector_element_basic_type(this);
 9312     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
 9313                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
    // TODO: Once the auto-vectorizer supports the ConvL2I operation,
    // CountLeadingZerosV will be followed by its corresponding vector IR and
    // the special handling below should be removed.
 9317     if (rbt == T_INT && bt == T_LONG) {
 9318       __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
 9319     }
 9320   %}
 9321   ins_pipe( pipe_slow );
 9322 %}
 9323 
 9324 // ---------------------------------- Vector Masked Operations ------------------------------------
 9325 
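// The rules below match the masked ideal form (OpV (Binary dst src2) mask) and
// emit the corresponding EVEX instruction through evmasked_op. The boolean
// argument selects merge-masking: with merge == true, lanes whose mask bit is
// clear keep the previous contents of $dst instead of being zeroed.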
 9326 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
 9327   match(Set dst (AddVB (Binary dst src2) mask));
 9328   match(Set dst (AddVS (Binary dst src2) mask));
 9329   match(Set dst (AddVI (Binary dst src2) mask));
 9330   match(Set dst (AddVL (Binary dst src2) mask));
 9331   match(Set dst (AddVF (Binary dst src2) mask));
 9332   match(Set dst (AddVD (Binary dst src2) mask));
 9333   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9334   ins_encode %{
 9335     int vlen_enc = vector_length_encoding(this);
 9336     BasicType bt = Matcher::vector_element_basic_type(this);
 9337     int opc = this->ideal_Opcode();
 9338     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9339                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9340   %}
 9341   ins_pipe( pipe_slow );
 9342 %}
 9343 
 9344 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
 9345   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
 9346   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
 9347   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
 9348   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
 9349   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
 9350   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
 9351   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
 9352   ins_encode %{
 9353     int vlen_enc = vector_length_encoding(this);
 9354     BasicType bt = Matcher::vector_element_basic_type(this);
 9355     int opc = this->ideal_Opcode();
 9356     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9357                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9358   %}
 9359   ins_pipe( pipe_slow );
 9360 %}
 9361 
 9362 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
 9363   match(Set dst (XorV (Binary dst src2) mask));
 9364   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9365   ins_encode %{
 9366     int vlen_enc = vector_length_encoding(this);
 9367     BasicType bt = Matcher::vector_element_basic_type(this);
 9368     int opc = this->ideal_Opcode();
 9369     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9370                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9371   %}
 9372   ins_pipe( pipe_slow );
 9373 %}
 9374 
 9375 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
 9376   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
 9377   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
 9378   ins_encode %{
 9379     int vlen_enc = vector_length_encoding(this);
 9380     BasicType bt = Matcher::vector_element_basic_type(this);
 9381     int opc = this->ideal_Opcode();
 9382     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9383                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9384   %}
 9385   ins_pipe( pipe_slow );
 9386 %}
 9387 
 9388 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
 9389   match(Set dst (OrV (Binary dst src2) mask));
 9390   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9391   ins_encode %{
 9392     int vlen_enc = vector_length_encoding(this);
 9393     BasicType bt = Matcher::vector_element_basic_type(this);
 9394     int opc = this->ideal_Opcode();
 9395     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9396                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9397   %}
 9398   ins_pipe( pipe_slow );
 9399 %}
 9400 
 9401 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
 9402   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
 9403   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
 9404   ins_encode %{
 9405     int vlen_enc = vector_length_encoding(this);
 9406     BasicType bt = Matcher::vector_element_basic_type(this);
 9407     int opc = this->ideal_Opcode();
 9408     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9409                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9410   %}
 9411   ins_pipe( pipe_slow );
 9412 %}
 9413 
 9414 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
 9415   match(Set dst (AndV (Binary dst src2) mask));
 9416   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9417   ins_encode %{
 9418     int vlen_enc = vector_length_encoding(this);
 9419     BasicType bt = Matcher::vector_element_basic_type(this);
 9420     int opc = this->ideal_Opcode();
 9421     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9422                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9423   %}
 9424   ins_pipe( pipe_slow );
 9425 %}
 9426 
 9427 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
 9428   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
 9429   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
 9430   ins_encode %{
 9431     int vlen_enc = vector_length_encoding(this);
 9432     BasicType bt = Matcher::vector_element_basic_type(this);
 9433     int opc = this->ideal_Opcode();
 9434     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9435                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9436   %}
 9437   ins_pipe( pipe_slow );
 9438 %}
 9439 
 9440 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
 9441   match(Set dst (SubVB (Binary dst src2) mask));
 9442   match(Set dst (SubVS (Binary dst src2) mask));
 9443   match(Set dst (SubVI (Binary dst src2) mask));
 9444   match(Set dst (SubVL (Binary dst src2) mask));
 9445   match(Set dst (SubVF (Binary dst src2) mask));
 9446   match(Set dst (SubVD (Binary dst src2) mask));
 9447   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9448   ins_encode %{
 9449     int vlen_enc = vector_length_encoding(this);
 9450     BasicType bt = Matcher::vector_element_basic_type(this);
 9451     int opc = this->ideal_Opcode();
 9452     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9453                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9454   %}
 9455   ins_pipe( pipe_slow );
 9456 %}
 9457 
 9458 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
 9459   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
 9460   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
 9461   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
 9462   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
 9463   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
 9464   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
 9465   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
 9466   ins_encode %{
 9467     int vlen_enc = vector_length_encoding(this);
 9468     BasicType bt = Matcher::vector_element_basic_type(this);
 9469     int opc = this->ideal_Opcode();
 9470     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9471                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9472   %}
 9473   ins_pipe( pipe_slow );
 9474 %}
 9475 
 9476 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
 9477   match(Set dst (MulVS (Binary dst src2) mask));
 9478   match(Set dst (MulVI (Binary dst src2) mask));
 9479   match(Set dst (MulVL (Binary dst src2) mask));
 9480   match(Set dst (MulVF (Binary dst src2) mask));
 9481   match(Set dst (MulVD (Binary dst src2) mask));
 9482   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9483   ins_encode %{
 9484     int vlen_enc = vector_length_encoding(this);
 9485     BasicType bt = Matcher::vector_element_basic_type(this);
 9486     int opc = this->ideal_Opcode();
 9487     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9488                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9489   %}
 9490   ins_pipe( pipe_slow );
 9491 %}
 9492 
 9493 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
 9494   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
 9495   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
 9496   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
 9497   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
 9498   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
 9499   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
 9500   ins_encode %{
 9501     int vlen_enc = vector_length_encoding(this);
 9502     BasicType bt = Matcher::vector_element_basic_type(this);
 9503     int opc = this->ideal_Opcode();
 9504     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9505                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9506   %}
 9507   ins_pipe( pipe_slow );
 9508 %}
 9509 
 9510 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
 9511   match(Set dst (SqrtVF dst mask));
 9512   match(Set dst (SqrtVD dst mask));
 9513   ins_cost(100);
 9514   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
 9515   ins_encode %{
 9516     int vlen_enc = vector_length_encoding(this);
 9517     BasicType bt = Matcher::vector_element_basic_type(this);
 9518     int opc = this->ideal_Opcode();
 9519     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9520                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
 9521   %}
 9522   ins_pipe( pipe_slow );
 9523 %}
 9524 
 9525 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
 9526   match(Set dst (DivVF (Binary dst src2) mask));
 9527   match(Set dst (DivVD (Binary dst src2) mask));
 9528   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9529   ins_encode %{
 9530     int vlen_enc = vector_length_encoding(this);
 9531     BasicType bt = Matcher::vector_element_basic_type(this);
 9532     int opc = this->ideal_Opcode();
 9533     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9534                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9535   %}
 9536   ins_pipe( pipe_slow );
 9537 %}
 9538 
 9539 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
 9540   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
 9541   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
 9542   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
 9543   ins_encode %{
 9544     int vlen_enc = vector_length_encoding(this);
 9545     BasicType bt = Matcher::vector_element_basic_type(this);
 9546     int opc = this->ideal_Opcode();
 9547     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9548                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9549   %}
 9550   ins_pipe( pipe_slow );
 9551 %}
 9552 
 9554 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9555   match(Set dst (RotateLeftV (Binary dst shift) mask));
 9556   match(Set dst (RotateRightV (Binary dst shift) mask));
 9557   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
 9558   ins_encode %{
 9559     int vlen_enc = vector_length_encoding(this);
 9560     BasicType bt = Matcher::vector_element_basic_type(this);
 9561     int opc = this->ideal_Opcode();
 9562     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9563                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9564   %}
 9565   ins_pipe( pipe_slow );
 9566 %}
 9567 
 9568 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
 9569   match(Set dst (RotateLeftV (Binary dst src2) mask));
 9570   match(Set dst (RotateRightV (Binary dst src2) mask));
 9571   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
 9572   ins_encode %{
 9573     int vlen_enc = vector_length_encoding(this);
 9574     BasicType bt = Matcher::vector_element_basic_type(this);
 9575     int opc = this->ideal_Opcode();
 9576     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9577                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
 9578   %}
 9579   ins_pipe( pipe_slow );
 9580 %}
 9581 
 9582 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9583   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
 9584   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
 9585   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
 9586   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
 9587   ins_encode %{
 9588     int vlen_enc = vector_length_encoding(this);
 9589     BasicType bt = Matcher::vector_element_basic_type(this);
 9590     int opc = this->ideal_Opcode();
 9591     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9592                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9593   %}
 9594   ins_pipe( pipe_slow );
 9595 %}
 9596 
 9597 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9598   predicate(!n->as_ShiftV()->is_var_shift());
 9599   match(Set dst (LShiftVS (Binary dst src2) mask));
 9600   match(Set dst (LShiftVI (Binary dst src2) mask));
 9601   match(Set dst (LShiftVL (Binary dst src2) mask));
 9602   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9603   ins_encode %{
 9604     int vlen_enc = vector_length_encoding(this);
 9605     BasicType bt = Matcher::vector_element_basic_type(this);
 9606     int opc = this->ideal_Opcode();
 9607     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9608                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9609   %}
 9610   ins_pipe( pipe_slow );
 9611 %}
 9612 
 9613 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9614   predicate(n->as_ShiftV()->is_var_shift());
 9615   match(Set dst (LShiftVS (Binary dst src2) mask));
 9616   match(Set dst (LShiftVI (Binary dst src2) mask));
 9617   match(Set dst (LShiftVL (Binary dst src2) mask));
 9618   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9619   ins_encode %{
 9620     int vlen_enc = vector_length_encoding(this);
 9621     BasicType bt = Matcher::vector_element_basic_type(this);
 9622     int opc = this->ideal_Opcode();
 9623     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9624                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9625   %}
 9626   ins_pipe( pipe_slow );
 9627 %}
 9628 
 9629 instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9630   match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
 9631   match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
 9632   match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
 9633   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
 9634   ins_encode %{
 9635     int vlen_enc = vector_length_encoding(this);
 9636     BasicType bt = Matcher::vector_element_basic_type(this);
 9637     int opc = this->ideal_Opcode();
 9638     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9639                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9640   %}
 9641   ins_pipe( pipe_slow );
 9642 %}
 9643 
 9644 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9645   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
 9646   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
 9647   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
 9648   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
 9649   ins_encode %{
 9650     int vlen_enc = vector_length_encoding(this);
 9651     BasicType bt = Matcher::vector_element_basic_type(this);
 9652     int opc = this->ideal_Opcode();
 9653     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9654                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9655   %}
 9656   ins_pipe( pipe_slow );
 9657 %}
 9658 
 9659 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9660   predicate(!n->as_ShiftV()->is_var_shift());
 9661   match(Set dst (RShiftVS (Binary dst src2) mask));
 9662   match(Set dst (RShiftVI (Binary dst src2) mask));
 9663   match(Set dst (RShiftVL (Binary dst src2) mask));
 9664   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9665   ins_encode %{
 9666     int vlen_enc = vector_length_encoding(this);
 9667     BasicType bt = Matcher::vector_element_basic_type(this);
 9668     int opc = this->ideal_Opcode();
 9669     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9670                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9671   %}
 9672   ins_pipe( pipe_slow );
 9673 %}
 9674 
 9675 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9676   predicate(n->as_ShiftV()->is_var_shift());
 9677   match(Set dst (RShiftVS (Binary dst src2) mask));
 9678   match(Set dst (RShiftVI (Binary dst src2) mask));
 9679   match(Set dst (RShiftVL (Binary dst src2) mask));
 9680   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9681   ins_encode %{
 9682     int vlen_enc = vector_length_encoding(this);
 9683     BasicType bt = Matcher::vector_element_basic_type(this);
 9684     int opc = this->ideal_Opcode();
 9685     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9686                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9687   %}
 9688   ins_pipe( pipe_slow );
 9689 %}
 9690 
 9691 instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9692   match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
 9693   match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
 9694   match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
 9695   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
 9696   ins_encode %{
 9697     int vlen_enc = vector_length_encoding(this);
 9698     BasicType bt = Matcher::vector_element_basic_type(this);
 9699     int opc = this->ideal_Opcode();
 9700     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9701                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9702   %}
 9703   ins_pipe( pipe_slow );
 9704 %}
 9705 
 9706 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
 9707   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
 9708   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
 9709   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
 9710   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
 9711   ins_encode %{
 9712     int vlen_enc = vector_length_encoding(this);
 9713     BasicType bt = Matcher::vector_element_basic_type(this);
 9714     int opc = this->ideal_Opcode();
 9715     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9716                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
 9717   %}
 9718   ins_pipe( pipe_slow );
 9719 %}
 9720 
 9721 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
 9722   predicate(!n->as_ShiftV()->is_var_shift());
 9723   match(Set dst (URShiftVS (Binary dst src2) mask));
 9724   match(Set dst (URShiftVI (Binary dst src2) mask));
 9725   match(Set dst (URShiftVL (Binary dst src2) mask));
 9726   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9727   ins_encode %{
 9728     int vlen_enc = vector_length_encoding(this);
 9729     BasicType bt = Matcher::vector_element_basic_type(this);
 9730     int opc = this->ideal_Opcode();
 9731     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9732                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
 9733   %}
 9734   ins_pipe( pipe_slow );
 9735 %}
 9736 
 9737 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
 9738   predicate(n->as_ShiftV()->is_var_shift());
 9739   match(Set dst (URShiftVS (Binary dst src2) mask));
 9740   match(Set dst (URShiftVI (Binary dst src2) mask));
 9741   match(Set dst (URShiftVL (Binary dst src2) mask));
 9742   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9743   ins_encode %{
 9744     int vlen_enc = vector_length_encoding(this);
 9745     BasicType bt = Matcher::vector_element_basic_type(this);
 9746     int opc = this->ideal_Opcode();
 9747     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9748                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
 9749   %}
 9750   ins_pipe( pipe_slow );
 9751 %}
 9752 
 9753 instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
 9754   match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
 9755   match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
 9756   match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
 9757   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
 9758   ins_encode %{
 9759     int vlen_enc = vector_length_encoding(this);
 9760     BasicType bt = Matcher::vector_element_basic_type(this);
 9761     int opc = this->ideal_Opcode();
 9762     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
 9763                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
 9764   %}
 9765   ins_pipe( pipe_slow );
 9766 %}
 9767 
instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MaxV (Binary dst src2) mask));
  format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
  format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MinV (Binary dst src2) mask));
  format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
  format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

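// Masked vector rearrange (lane permutation). Note that the merge argument is
// false here: with EVEX zero-masking, lanes whose mask bit is clear are set
// to zero rather than retaining their previous contents.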
instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (VectorRearrange (Binary dst src2) mask));
  format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

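// Masked vector absolute value. AbsV is unary, so $dst is passed for both
// source positions of the three-operand evmasked_op() helper.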
instruct vabs_masked(vec dst, kReg mask) %{
  match(Set dst (AbsVB dst mask));
  match(Set dst (AbsVS dst mask));
  match(Set dst (AbsVI dst mask));
  match(Set dst (AbsVL dst mask));
  format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
  ins_cost(100);
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

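// Masked fused multiply-add. evmasked_op() selects an AVX-512 FMA form for
// the element type (float or double) and applies the mask with merge
// semantics.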
instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

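// Masked vector comparison producing an opmask register. The scalar
// boolean-test condition is translated into an AVX-512 comparison predicate;
// integral types use evpcmp{b,w,d,q} (signed or unsigned encoding as needed),
// floating point uses evcmpps/evcmppd. The incoming $mask predicates the
// compare, so lanes it clears yield 0 in $dst.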
instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask, rRegP scratch) %{
  match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
  effect(TEMP scratch);
  format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask\t! using $scratch as TEMP" %}
  ins_encode %{
    assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
    int vlen_enc = vector_length_encoding(this, $src1);
    BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);

    // Select the masked compare instruction based on the element type of src1.
    switch (src1_elem_bt) {
      case T_BYTE: {
        bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_SHORT: {
        bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_INT: {
        bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_LONG: {
        bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_FLOAT: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      case T_DOUBLE: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      default: assert(false, "%s", type2name(src1_elem_bt)); break;
    }
  %}
  ins_pipe( pipe_slow );
%}

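// MaskAll replicates a scalar boolean across all mask_len bits of an opmask
// register, e.g. MaskAll(-1) sets every bit of the mask.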
instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
  predicate(Matcher::vector_length(n) <= 32);
  match(Set dst (MaskAll src));
  format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
  %}
  ins_pipe( pipe_slow );
%}

#ifdef _LP64
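// Mask negation: (XorVMask src (MaskAll -1)) flips every bit of a mask. For
// mask lengths below 8 the knot is performed through the $ktmp/$rtmp
// temporaries; for the widths listed in mask_not_imm a single knot of the
// matching width suffices.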
instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
  predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
  match(Set dst (XorVMask src (MaskAll cnt)));
  effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
  format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
  predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
            (Matcher::vector_length(n) == 16) ||
            (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
  match(Set dst (XorVMask src (MaskAll cnt)));
  format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}

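// VectorLongToMask materializes the low mask_len bits of a long either as a
// vector of boolean lanes (AVX targets without a vectmask type) or directly
// in a k register via kmov (EVEX targets, see long_to_mask_evex).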
instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
  predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) <= 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, xnoreg, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
  predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) > 8);
  match(Set dst (VectorLongToMask src));
  effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
  format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
  ins_encode %{
    int mask_len = Matcher::vector_length(this);
    assert(mask_len <= 32, "invalid mask length");
    int vec_enc  = vector_length_encoding(mask_len);
    __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
                              $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct long_to_mask_evex(kReg dst, rRegL src) %{
  predicate(n->bottom_type()->isa_vectmask());
  match(Set dst (VectorLongToMask src));
  format %{ "long_to_mask_evex $dst, $src\t!" %}
  ins_encode %{
    __ kmov($dst$$KRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}
#endif

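// Bitwise AND/OR/XOR of opmask registers. The word-sized kandw/korw/kxorw
// forms are available in AVX512F, while the byte forms require AVX512DQ, so
// mask lengths below 16 are widened to 16 when avx512dq is unsupported.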
instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
  match(Set dst (AndVMask src1 src2));
  match(Set dst (OrVMask src1 src2));
  match(Set dst (XorVMask src1 src2));
  effect(TEMP kscratch);
  format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
  ins_encode %{
    const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
    const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
    assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
    uint masklen = Matcher::vector_length(this);
    masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
    __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}

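// Masked ternary logic. vpternlog computes an arbitrary three-input boolean
// function of $dst, $src2 and $src3, selected by the 8-bit truth table in
// $func (e.g. 0xE8 encodes the majority function); merge-masking preserves
// $dst in lanes the mask clears.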
instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vternlog_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
  match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
  format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
                  $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

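// CastVV nodes only pin a vector type for the optimizer and register
// allocator; they are register-to-register identities and emit no code,
// hence size(0) and ins_cost(0).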
instruct castMM(kReg dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVV(vec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVVLeg(legVec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}