//
// Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
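//
// For illustration only (a minimal sketch, not one of this file's own
// definitions): a general-purpose register such as RAX would be declared as
//   reg_def RAX (SOC, SOC, Op_RegI, 0, rax->as_VMReg());
// i.e. save-on-call for both the allocator and the C calling convention,
// spilled as an int (Op_RegI), with hardware encoding 0.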

// XMM registers.  512-bit registers, 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre-EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              XMM0-XMM3 might hold parameters

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for all 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 128bit vector registers
reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d
#endif
                      );

// Class for all 128bit vector registers
reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d,
                      XMM16, XMM16b, XMM16c, XMM16d,
                      XMM17, XMM17b, XMM17c, XMM17d,
                      XMM18, XMM18b, XMM18c, XMM18d,
                      XMM19, XMM19b, XMM19c, XMM19d,
                      XMM20, XMM20b, XMM20c, XMM20d,
                      XMM21, XMM21b, XMM21c, XMM21d,
                      XMM22, XMM22b, XMM22c, XMM22d,
                      XMM23, XMM23b, XMM23c, XMM23d,
                      XMM24, XMM24b, XMM24c, XMM24d,
                      XMM25, XMM25b, XMM25c, XMM25d,
                      XMM26, XMM26b, XMM26c, XMM26d,
                      XMM27, XMM27b, XMM27c, XMM27d,
                      XMM28, XMM28b, XMM28c, XMM28d,
                      XMM29, XMM29b, XMM29c, XMM29d,
                      XMM30, XMM30b, XMM30c, XMM30d,
                      XMM31, XMM31b, XMM31c, XMM31d
#endif
                      );

reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 256bit vector registers
reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1035 #ifdef _LP64
1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
1044 #endif
1045                       );
1046 
1047 // Class for all 256bit vector registers
1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1056 #ifdef _LP64
1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1081 #endif
1082                       );
1083 
1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1086 
1087 // Class for all 512bit vector registers
1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1096 #ifdef _LP64
1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1121 #endif
1122                       );
1123 
1124 // Class for restricted 512bit vector registers
1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1133 #ifdef _LP64
1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1142 #endif
1143                       );
1144 
1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1147 
1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1149 %}
1150 
1151 
1152 //----------SOURCE BLOCK-------------------------------------------------------
1153 // This is a block of C++ code which provides values, functions, and
1154 // definitions necessary in the rest of the architecture description
1155 
1156 source_hpp %{
1157 // Header information of the source block.
1158 // Method declarations/definitions which are used outside
1159 // the ad-scope can conveniently be defined here.
1160 //
1161 // To keep related declarations/definitions/uses close together,
1162 // we switch between source and source_hpp blocks freely as needed.
1163 
1164 #include "runtime/vm_version.hpp"
1165 
1166 class NativeJump;
1167 
1168 class CallStubImpl {
1169 
1170   //--------------------------------------------------------------
1171   //---<  Used for optimization in Compile::shorten_branches  >---
1172   //--------------------------------------------------------------
1173 
1174  public:
1175   // Size of call trampoline stub.
1176   static uint size_call_trampoline() {
1177     return 0; // no call trampolines on this platform
1178   }
1179 
1180   // number of relocations needed by a call trampoline stub
1181   static uint reloc_call_trampoline() {
1182     return 0; // no call trampolines on this platform
1183   }
1184 };
1185 
1186 class HandlerImpl {
1187 
1188  public:
1189 
1190   static int emit_exception_handler(CodeBuffer &cbuf);
1191   static int emit_deopt_handler(CodeBuffer& cbuf);
1192 
1193   static uint size_exception_handler() {
1194     // NativeCall instruction size is the same as NativeJump.
1195     // The exception handler starts out as a jump and can be patched to
1196     // a call by deoptimization.  (4932387)
1197     // Note that this value is also credited (in output.cpp) to
1198     // the size of the code section.
1199     return NativeJump::instruction_size;
1200   }
1201 
1202 #ifdef _LP64
1203   static uint size_deopt_handler() {
1204     // three 5-byte instructions plus one move for the unreachable address.
1205     return 15+3;
1206   }
1207 #else
1208   static uint size_deopt_handler() {
1209     // NativeCall instruction size is the same as NativeJump.
1210     // The handler starts out as a jump and can be patched to
1211     // a call by deoptimization.  (4932387)
1212     // Note that this value is also credited (in output.cpp) to
1213     // the size of the code section.
1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1215   }
1216 #endif
1217 };
1218 
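     // Map a vector size in bytes to the AVX vector-length encoding used by the assembler.
     // Sub-16-byte vectors (4 and 8 bytes) are encoded as 128-bit operations; for example,
     // 32 bytes maps to Assembler::AVX_256bit.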
1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
1220   switch(bytes) {
1221     case  4: // fall-through
1222     case  8: // fall-through
1223     case 16: return Assembler::AVX_128bit;
1224     case 32: return Assembler::AVX_256bit;
1225     case 64: return Assembler::AVX_512bit;
1226 
1227     default: {
1228       ShouldNotReachHere();
1229       return Assembler::AVX_NoVec;
1230     }
1231   }
1232 }
1233 
1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
1236 }
1237 
1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1239   uint def_idx = use->operand_index(opnd);
1240   Node* def = use->in(def_idx);
1241   return vector_length_encoding(def);
1242 }
1243 
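     // Returns true for the unsigned BoolTest relations (e.g. ult, ule, ugt, uge),
     // i.e. those carrying the BoolTest::unsigned_compare bit.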
1244 static inline bool is_unsigned_booltest_pred(int bt) {
1245   return (bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare;
1246 }
1247 
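     // Platform-dependent node flags. Flag_intel_jcc_erratum marks machine nodes whose
     // branches may need extra padding to avoid the Intel JCC erratum; it is consumed by
     // MachNode::compute_padding() below.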
1248 class Node::PD {
1249 public:
1250   enum NodeFlags {
1251     Flag_intel_jcc_erratum = Node::_last_flag << 1,
1252     _last_flag             = Flag_intel_jcc_erratum
1253   };
1254 };
1255 
1256 %} // end source_hpp
1257 
1258 source %{
1259 
1260 #include "opto/addnode.hpp"
1261 #include "c2_intelJccErratum_x86.hpp"
1262 
1263 void PhaseOutput::pd_perform_mach_node_analysis() {
1264   if (VM_Version::has_intel_jcc_erratum()) {
1265     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
1266     _buf_sizes._code += extra_padding;
1267   }
1268 }
1269 
1270 int MachNode::pd_alignment_required() const {
1271   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
1272     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
1273     return IntelJccErratum::largest_jcc_size() + 1;
1274   } else {
1275     return 1;
1276   }
1277 }
1278 
1279 int MachNode::compute_padding(int current_offset) const {
1280   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
1281     Compile* C = Compile::current();
1282     PhaseOutput* output = C->output();
1283     Block* block = output->block();
1284     int index = output->index();
1285     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
1286   } else {
1287     return 0;
1288   }
1289 }
1290 
1291 // Emit exception handler code.
1292 // Stuff framesize into a register and call a VM stub routine.
1293 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1294 
1295   // Note that the code buffer's insts_mark is always relative to insts.
1296   // That's why we must use the macroassembler to generate a handler.
1297   C2_MacroAssembler _masm(&cbuf);
1298   address base = __ start_a_stub(size_exception_handler());
1299   if (base == NULL) {
1300     ciEnv::current()->record_failure("CodeCache is full");
1301     return 0;  // CodeBuffer::expand failed
1302   }
1303   int offset = __ offset();
1304   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1305   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1306   __ end_a_stub();
1307   return offset;
1308 }
1309 
1310 // Emit deopt handler code.
1311 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1312 
1313   // Note that the code buffer's insts_mark is always relative to insts.
1314   // That's why we must use the macroassembler to generate a handler.
1315   C2_MacroAssembler _masm(&cbuf);
1316   address base = __ start_a_stub(size_deopt_handler());
1317   if (base == NULL) {
1318     ciEnv::current()->record_failure("CodeCache is full");
1319     return 0;  // CodeBuffer::expand failed
1320   }
1321   int offset = __ offset();
1322 
1323 #ifdef _LP64
1324   address the_pc = (address) __ pc();
1325   Label next;
1326   // Push "the_pc" on the stack without destroying any registers,
1327   // as they may all be live.
1328 
1329   // push address of "next"
1330   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1331   __ bind(next);
1332   // adjust it so it matches "the_pc"
1333   __ subptr(Address(rsp, 0), __ offset() - offset);
1334 #else
1335   InternalAddress here(__ pc());
1336   __ pushptr(here.addr());
1337 #endif
1338 
1339   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1340   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1341   __ end_a_stub();
1342   return offset;
1343 }
1344 
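     // Map a Java basic type to the assembler operand width used for vector instructions;
     // for example, T_SHORT selects Assembler::W.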
1345 Assembler::Width widthForType(BasicType bt) {
1346   if (bt == T_BYTE) {
1347     return Assembler::B;
1348   } else if (bt == T_SHORT) {
1349     return Assembler::W;
1350   } else if (bt == T_INT) {
1351     return Assembler::D;
1352   } else {
1353     assert(bt == T_LONG, "not a long: %s", type2name(bt));
1354     return Assembler::Q;
1355   }
1356 }
1357 
1358 //=============================================================================
1359 
1360   // Float masks come from different places depending on platform.
1361 #ifdef _LP64
1362   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1363   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1364   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1365   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1366 #else
1367   static address float_signmask()  { return (address)float_signmask_pool; }
1368   static address float_signflip()  { return (address)float_signflip_pool; }
1369   static address double_signmask() { return (address)double_signmask_pool; }
1370   static address double_signflip() { return (address)double_signflip_pool; }
1371 #endif
1372   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1373   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1374   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1375   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1376   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1377   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
1378   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1379   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
1380   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1381   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1382   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1383   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1384   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1385   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
1386   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
1387 
1388 //=============================================================================
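     // Returns whether a match rule for the given ideal opcode exists and is usable on the
     // current CPU. For example, Op_PopCountVI is rejected below unless
     // UsePopCountInstruction is enabled and AVX512_VPOPCNTDQ is available.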
1389 const bool Matcher::match_rule_supported(int opcode) {
1390   if (!has_match_rule(opcode)) {
1391     return false; // no match rule present
1392   }
1393   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1394   switch (opcode) {
1395     case Op_AbsVL:
1396     case Op_StoreVectorScatter:
1397       if (UseAVX < 3) {
1398         return false;
1399       }
1400       break;
1401     case Op_PopCountI:
1402     case Op_PopCountL:
1403       if (!UsePopCountInstruction) {
1404         return false;
1405       }
1406       break;
1407     case Op_PopCountVI:
1408     case Op_PopCountVL:
1409       if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
1410         return false;
1411       }
1412       break;
1413     case Op_MulVI:
1414       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
1415         return false;
1416       }
1417       break;
1418     case Op_MulVL:
1419       if (UseSSE < 4) { // only with SSE4_1 or AVX
1420         return false;
1421       }
1422       break;
1423     case Op_MulReductionVL:
1424       if (!VM_Version::supports_avx512dq()) {
1425         return false;
1426       }
1427       break;
1428     case Op_AddReductionVL:
1429       if (UseSSE < 2) { // requires at least SSE2
1430         return false;
1431       }
1432       break;
1433     case Op_AbsVB:
1434     case Op_AbsVS:
1435     case Op_AbsVI:
1436     case Op_AddReductionVI:
1437     case Op_AndReductionV:
1438     case Op_OrReductionV:
1439     case Op_XorReductionV:
1440       if (UseSSE < 3) { // requires at least SSSE3
1441         return false;
1442       }
1443       break;
1444     case Op_VectorLoadShuffle:
1445     case Op_VectorRearrange:
1446     case Op_MulReductionVI:
1447       if (UseSSE < 4) { // requires at least SSE4
1448         return false;
1449       }
1450       break;
1451     case Op_SqrtVD:
1452     case Op_SqrtVF:
1453     case Op_VectorMaskCmp:
1454     case Op_VectorCastB2X:
1455     case Op_VectorCastS2X:
1456     case Op_VectorCastI2X:
1457     case Op_VectorCastL2X:
1458     case Op_VectorCastF2X:
1459     case Op_VectorCastD2X:
1460       if (UseAVX < 1) { // enabled for AVX only
1461         return false;
1462       }
1463       break;
1464     case Op_CompareAndSwapL:
1465 #ifdef _LP64
1466     case Op_CompareAndSwapP:
1467 #endif
1468       if (!VM_Version::supports_cx8()) {
1469         return false;
1470       }
1471       break;
1472     case Op_CMoveVF:
1473     case Op_CMoveVD:
1474       if (UseAVX < 1) { // enabled for AVX only
1475         return false;
1476       }
1477       break;
1478     case Op_StrIndexOf:
1479       if (!UseSSE42Intrinsics) {
1480         return false;
1481       }
1482       break;
1483     case Op_StrIndexOfChar:
1484       if (!UseSSE42Intrinsics) {
1485         return false;
1486       }
1487       break;
1488     case Op_OnSpinWait:
1489       if (!VM_Version::supports_on_spin_wait()) {
1490         return false;
1491       }
1492       break;
1493     case Op_MulVB:
1494     case Op_LShiftVB:
1495     case Op_RShiftVB:
1496     case Op_URShiftVB:
1497     case Op_VectorInsert:
1498     case Op_VectorLoadMask:
1499     case Op_VectorStoreMask:
1500     case Op_VectorBlend:
1501       if (UseSSE < 4) {
1502         return false;
1503       }
1504       break;
1505 #ifdef _LP64
1506     case Op_MaxD:
1507     case Op_MaxF:
1508     case Op_MinD:
1509     case Op_MinF:
1510       if (UseAVX < 1) { // enabled for AVX only
1511         return false;
1512       }
1513       break;
1514 #endif
1515     case Op_CacheWB:
1516     case Op_CacheWBPreSync:
1517     case Op_CacheWBPostSync:
1518       if (!VM_Version::supports_data_cache_line_flush()) {
1519         return false;
1520       }
1521       break;
1522     case Op_ExtractB:
1523     case Op_ExtractL:
1524     case Op_ExtractI:
1525     case Op_RoundDoubleMode:
1526       if (UseSSE < 4) {
1527         return false;
1528       }
1529       break;
1530     case Op_RoundDoubleModeV:
1531       if (!VM_Version::supports_avx()) {
1532         return false; // 128bit vroundpd is not available
1533       }
1534       break;
1535     case Op_LoadVectorGather:
1536       if (UseAVX < 2) {
1537         return false;
1538       }
1539       break;
1540     case Op_FmaVD:
1541     case Op_FmaVF:
1542       if (!UseFMA) {
1543         return false;
1544       }
1545       break;
1546     case Op_MacroLogicV:
1547       if (UseAVX < 3 || !UseVectorMacroLogic) {
1548         return false;
1549       }
1550       break;
1551 
1552     case Op_VectorCmpMasked:
1553     case Op_VectorMaskGen:
1554     case Op_LoadVectorMasked:
1555     case Op_StoreVectorMasked:
1556       if (!is_LP64 || UseAVX < 3 || !VM_Version::supports_bmi2()) {
1557         return false;
1558       }
1559       break;
1560     case Op_VectorMaskFirstTrue:
1561     case Op_VectorMaskLastTrue:
1562     case Op_VectorMaskTrueCount:
1563     case Op_VectorMaskToLong:
1564       if (!is_LP64 || UseAVX < 1) {
1565          return false;
1566       }
1567       break;
1568     case Op_CopySignD:
1569     case Op_CopySignF:
1570       if (UseAVX < 3 || !is_LP64) {
1571         return false;
1572       }
1573       if (!VM_Version::supports_avx512vl()) {
1574         return false;
1575       }
1576       break;
1577 #ifndef _LP64
1578     case Op_AddReductionVF:
1579     case Op_AddReductionVD:
1580     case Op_MulReductionVF:
1581     case Op_MulReductionVD:
1582       if (UseSSE < 1) { // requires at least SSE
1583         return false;
1584       }
1585       break;
1586     case Op_MulAddVS2VI:
1587     case Op_RShiftVL:
1588     case Op_AbsVD:
1589     case Op_NegVD:
1590       if (UseSSE < 2) {
1591         return false;
1592       }
1593       break;
1594 #endif // !LP64
1595     case Op_SignumF:
1596       if (UseSSE < 1) {
1597         return false;
1598       }
1599       break;
1600     case Op_SignumD:
1601       if (UseSSE < 2) {
1602         return false;
1603       }
1604       break;
1605     case Op_SqrtF:
1606       if (UseSSE < 1) {
1607         return false;
1608       }
1609       break;
1610     case Op_SqrtD:
1611       if (UseSSE < 2) {
1612         return false;
1613       }
1614       break;
1615   }
1616   return true;  // Match rules are supported by default.
1617 }
1618 
1619 //------------------------------------------------------------------------
1620 
1621 // Identify extra cases in which we might want to provide match rules for vector nodes
1622 // and other intrinsics, guarded by vector length (vlen) and element type (bt).
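     // For example, a 512-bit MinV/MaxV on FLOAT or DOUBLE is rejected below unless
     // AVX512DQ is available, and a 256-bit VectorLoadShuffle additionally requires AVX2.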
1623 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1624   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1625   if (!match_rule_supported(opcode)) {
1626     return false;
1627   }
1628   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1629   //   * SSE2 supports 128bit vectors for all types;
1630   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1631   //   * AVX2 supports 256bit vectors for all types;
1632   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1633   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1634   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1635   // And MaxVectorSize is taken into account as well.
1636   if (!vector_size_supported(bt, vlen)) {
1637     return false;
1638   }
1639   // Special cases which require vector length follow:
1640   //   * implementation limitations
1641   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1642   //   * 128bit vroundpd instruction is present only in AVX1
1643   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1644   switch (opcode) {
1645     case Op_AbsVF:
1646     case Op_NegVF:
1647       if ((vlen == 16) && !VM_Version::supports_avx512dq()) {
1648         return false; // 512bit vandps and vxorps are not available
1649       }
1650       break;
1651     case Op_AbsVD:
1652     case Op_NegVD:
1653     case Op_MulVL:
1654       if ((vlen == 8) && !VM_Version::supports_avx512dq()) {
1655         return false; // 512bit vpmullq, vandpd and vxorpd are not available
1656       }
1657       break;
1658     case Op_CMoveVF:
1659       if (vlen != 8) {
1660         return false; // implementation limitation (only vcmov8F_reg is present)
1661       }
1662       break;
1663     case Op_RotateRightV:
1664     case Op_RotateLeftV:
1665       if (bt != T_INT && bt != T_LONG) {
1666         return false;
1667       } // fallthrough
1668     case Op_MacroLogicV:
1669       if (!VM_Version::supports_evex() ||
1670           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
1671         return false;
1672       }
1673       break;
1674     case Op_ClearArray:
1675     case Op_VectorMaskGen:
1676     case Op_VectorCmpMasked:
1677     case Op_LoadVectorMasked:
1678     case Op_StoreVectorMasked:
1679       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
1680         return false;
1681       }
1682       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
1683         return false;
1684       }
1685       break;
1686     case Op_CMoveVD:
1687       if (vlen != 4) {
1688         return false; // implementation limitation (only vcmov4D_reg is present)
1689       }
1690       break;
1691     case Op_MaxV:
1692     case Op_MinV:
1693       if (UseSSE < 4 && is_integral_type(bt)) {
1694         return false;
1695       }
1696       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
1697           // Float/Double intrinsics are enabled for AVX family currently.
1698           if (UseAVX == 0) {
1699             return false;
1700           }
1701           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
1702             return false;
1703           }
1704       }
1705       break;
1706     case Op_CallLeafVector:
1707       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
1708         return false;
1709       }
1710       break;
1711     case Op_AddReductionVI:
1712       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
1713         return false;
1714       }
1715       // fallthrough
1716     case Op_AndReductionV:
1717     case Op_OrReductionV:
1718     case Op_XorReductionV:
1719       if (is_subword_type(bt) && (UseSSE < 4)) {
1720         return false;
1721       }
1722 #ifndef _LP64
1723       if (bt == T_BYTE || bt == T_LONG) {
1724         return false;
1725       }
1726 #endif
1727       break;
1728 #ifndef _LP64
1729     case Op_VectorInsert:
1730       if (bt == T_LONG || bt == T_DOUBLE) {
1731         return false;
1732       }
1733       break;
1734 #endif
1735     case Op_MinReductionV:
1736     case Op_MaxReductionV:
1737       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
1738         return false;
1739       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
1740         return false;
1741       }
1742       // Float/Double intrinsics enabled for AVX family.
1743       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
1744         return false;
1745       }
1746       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
1747         return false;
1748       }
1749 #ifndef _LP64
1750       if (bt == T_BYTE || bt == T_LONG) {
1751         return false;
1752       }
1753 #endif
1754       break;
1755     case Op_VectorTest:
1756       if (UseSSE < 4) {
1757         return false; // Implementation limitation
1758       } else if (size_in_bits < 32) {
1759         return false; // Implementation limitation
1760       } else if (size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1761         return false; // Implementation limitation
1762       }
1763       break;
1764     case Op_VectorLoadShuffle:
1765     case Op_VectorRearrange:
1766       if (vlen == 2) {
1767         return false; // Implementation limitation due to how shuffle is loaded
1768       } else if (size_in_bits == 256 && UseAVX < 2) {
1769         return false; // Implementation limitation
1770       } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi())  {
1771         return false; // Implementation limitation
1772       } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw())  {
1773         return false; // Implementation limitation
1774       }
1775       break;
1776     case Op_VectorLoadMask:
1777       if (size_in_bits == 256 && UseAVX < 2) {
1778         return false; // Implementation limitation
1779       }
1780       // fallthrough
1781     case Op_VectorStoreMask:
1782       if (vlen == 2) {
1783         return false; // Implementation limitation
1784       }
1785       break;
1786     case Op_VectorCastB2X:
1787     case Op_VectorCastS2X:
1788     case Op_VectorCastI2X:
1789       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
1790         return false;
1791       }
1792       break;
1793     case Op_VectorCastL2X:
1794       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1795         return false;
1796       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
1797         return false;
1798       }
1799       break;
1800     case Op_VectorCastD2X:
1801       if (is_subword_type(bt) || bt == T_INT) {
1802         return false;
1803       }
1804       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
1805         return false;
1806       }
1807       break;
1808     case Op_VectorCastF2X:
1809       if (is_subword_type(bt) || bt == T_LONG) {
1810         return false;
1811       }
1812       break;
1813     case Op_MulReductionVI:
1814       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1815         return false;
1816       }
1817       break;
1818     case Op_LoadVectorGatherMasked:
1819     case Op_StoreVectorScatterMasked:
1820     case Op_StoreVectorScatter:
1821       if (is_subword_type(bt)) {
1822         return false;
1823       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1824         return false;
1825       }
1826       // fallthrough
1827     case Op_LoadVectorGather:
1828       if (size_in_bits == 64) {
1829         return false;
1830       }
1831       break;
1832     case Op_MaskAll:
1833       if (!VM_Version::supports_evex()) {
1834         return false;
1835       }
1836       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
1837         return false;
1838       }
1839       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1840         return false;
1841       }
1842       break;
1843     case Op_VectorMaskCmp:
1844       if (vlen < 2 || size_in_bits < 32) {
1845         return false;
1846       }
1847       break;
1848     case Op_VectorLongToMask:
1849       if (UseAVX < 1 || !is_LP64) {
1850         return false;
1851       }
1852       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
1853         return false;
1854       }
1855       break;
1856   }
1857   return true;  // Match rules are supported by default.
1858 }
1859 
1860 const bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
1861   // The ADLC-based match_rule_supported routine checks for the existence of a pattern
1862   // based on the IR opcode. Most unary/binary/ternary masked operations share the IR
1863   // nodes of their non-masked counterparts, with the mask edge being the differentiator.
1864   // This routine therefore does a strict check for the existence of masked operation
1865   // patterns: it returns false for every opcode except the ones whose masked
1866   // instruction patterns are defined in this file.
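       //
       // For example, a masked Op_AddVB is accepted only when AVX512BW is available
       // (and AVX512VL as well for vectors shorter than 512 bits, checked below).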
1867   if (!match_rule_supported_vector(opcode, vlen, bt)) {
1868     return false;
1869   }
1870 
1871   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1872   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1873   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
1874     return false;
1875   }
1876   switch(opcode) {
1877     // Unary masked operations
1878     case Op_AbsVB:
1879     case Op_AbsVS:
1880       if (!VM_Version::supports_avx512bw()) {
1881         return false;  // Implementation limitation
1882       }
1883     case Op_AbsVI:
1884     case Op_AbsVL:
1885       return true;
1886 
1887     // Ternary masked operations
1888     case Op_FmaVF:
1889     case Op_FmaVD:
1890       return true;
1891 
1892     case Op_MacroLogicV:
1893       if (bt != T_INT && bt != T_LONG) {
1894         return false;
1895       }
1896       return true;
1897 
1898     // Binary masked operations
1899     case Op_AddVB:
1900     case Op_AddVS:
1901     case Op_SubVB:
1902     case Op_SubVS:
1903     case Op_MulVS:
1904     case Op_LShiftVS:
1905     case Op_RShiftVS:
1906     case Op_URShiftVS:
1907       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1908       if (!VM_Version::supports_avx512bw()) {
1909         return false;  // Implementation limitation
1910       }
1911       return true;
1912 
1913     case Op_MulVL:
1914       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1915       if (!VM_Version::supports_avx512dq()) {
1916         return false;  // Implementation limitation
1917       }
1918       return true;
1919 
1920     case Op_AndV:
1921     case Op_OrV:
1922     case Op_XorV:
1923     case Op_RotateRightV:
1924     case Op_RotateLeftV:
1925       if (bt != T_INT && bt != T_LONG) {
1926         return false; // Implementation limitation
1927       }
1928       return true;
1929 
1930     case Op_VectorLoadMask:
1931       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1932       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
1933         return false;
1934       }
1935       return true;
1936 
1937     case Op_AddVI:
1938     case Op_AddVL:
1939     case Op_AddVF:
1940     case Op_AddVD:
1941     case Op_SubVI:
1942     case Op_SubVL:
1943     case Op_SubVF:
1944     case Op_SubVD:
1945     case Op_MulVI:
1946     case Op_MulVF:
1947     case Op_MulVD:
1948     case Op_DivVF:
1949     case Op_DivVD:
1950     case Op_SqrtVF:
1951     case Op_SqrtVD:
1952     case Op_LShiftVI:
1953     case Op_LShiftVL:
1954     case Op_RShiftVI:
1955     case Op_RShiftVL:
1956     case Op_URShiftVI:
1957     case Op_URShiftVL:
1958     case Op_LoadVectorMasked:
1959     case Op_StoreVectorMasked:
1960     case Op_LoadVectorGatherMasked:
1961     case Op_StoreVectorScatterMasked:
1962       return true;
1963 
1964     case Op_MaxV:
1965     case Op_MinV:
1966       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
1967         return false; // Implementation limitation
1968       }
1969       if (is_floating_point_type(bt)) {
1970         return false; // Implementation limitation
1971       }
1972       return true;
1973 
1974     case Op_VectorMaskCmp:
1975       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
1976         return false; // Implementation limitation
1977       }
1978       return true;
1979 
1980     case Op_VectorRearrange:
1981       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
1982         return false; // Implementation limitation
1983       }
1984       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
1985         return false; // Implementation limitation
1986       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
1987         return false; // Implementation limitation
1988       }
1989       return true;
1990 
1991     // Binary Logical operations
1992     case Op_AndVMask:
1993     case Op_OrVMask:
1994     case Op_XorVMask:
1995       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
1996         return false; // Implementation limitation
1997       }
1998       return true;
1999 
2000     case Op_MaskAll:
2001       return true;
2002 
2003     default:
2004       return false;
2005   }
2006 }
2007 
2008 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
2009   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
2010   bool legacy = (generic_opnd->opcode() == LEGVEC);
2011   if (!VM_Version::supports_avx512vlbwdq() && // KNL
2012       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
2013     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
2014     return new legVecZOper();
2015   }
2016   if (legacy) {
2017     switch (ideal_reg) {
2018       case Op_VecS: return new legVecSOper();
2019       case Op_VecD: return new legVecDOper();
2020       case Op_VecX: return new legVecXOper();
2021       case Op_VecY: return new legVecYOper();
2022       case Op_VecZ: return new legVecZOper();
2023     }
2024   } else {
2025     switch (ideal_reg) {
2026       case Op_VecS: return new vecSOper();
2027       case Op_VecD: return new vecDOper();
2028       case Op_VecX: return new vecXOper();
2029       case Op_VecY: return new vecYOper();
2030       case Op_VecZ: return new vecZOper();
2031     }
2032   }
2033   ShouldNotReachHere();
2034   return NULL;
2035 }
2036 
2037 bool Matcher::is_reg2reg_move(MachNode* m) {
2038   switch (m->rule()) {
2039     case MoveVec2Leg_rule:
2040     case MoveLeg2Vec_rule:
2041     case MoveF2VL_rule:
2042     case MoveF2LEG_rule:
2043     case MoveVL2F_rule:
2044     case MoveLEG2F_rule:
2045     case MoveD2VL_rule:
2046     case MoveD2LEG_rule:
2047     case MoveVL2D_rule:
2048     case MoveLEG2D_rule:
2049       return true;
2050     default:
2051       return false;
2052   }
2053 }
2054 
2055 bool Matcher::is_generic_vector(MachOper* opnd) {
2056   switch (opnd->opcode()) {
2057     case VEC:
2058     case LEGVEC:
2059       return true;
2060     default:
2061       return false;
2062   }
2063 }
2064 
2065 //------------------------------------------------------------------------
2066 
2067 const RegMask* Matcher::predicate_reg_mask(void) {
2068   return &_VECTMASK_REG_mask;
2069 }
2070 
2071 const TypeVectMask* Matcher::predicate_reg_type(const Type* elemTy, int length) {
2072   return new TypeVectMask(elemTy, length);
2073 }
2074 
2075 // Max vector size in bytes. 0 if not supported.
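     // For example, with UseAVX=3 the width for T_INT is 64 bytes, while T_SHORT is 64
     // bytes only when AVX512BW is available (32 otherwise); the result is then capped by
     // MaxVectorSize and dropped to 0 if it falls below the per-type minimum below.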
2076 const int Matcher::vector_width_in_bytes(BasicType bt) {
2077   assert(is_java_primitive(bt), "only primitive type vectors");
2078   if (UseSSE < 2) return 0;
2079   // SSE2 supports 128bit vectors for all types.
2080   // AVX2 supports 256bit vectors for all types.
2081   // EVEX (AVX-512) supports 512bit vectors for all types (BYTE/SHORT/CHAR additionally need AVX512BW, see below).
2082   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
2083   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
2084   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
2085     size = (UseAVX > 2) ? 64 : 32;
2086   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
2087     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
2088   // Use flag to limit vector size.
2089   size = MIN2(size,(int)MaxVectorSize);
2090   // Minimum 2 values in vector (or 4 for bytes).
2091   switch (bt) {
2092   case T_DOUBLE:
2093   case T_LONG:
2094     if (size < 16) return 0;
2095     break;
2096   case T_FLOAT:
2097   case T_INT:
2098     if (size < 8) return 0;
2099     break;
2100   case T_BOOLEAN:
2101     if (size < 4) return 0;
2102     break;
2103   case T_CHAR:
2104     if (size < 4) return 0;
2105     break;
2106   case T_BYTE:
2107     if (size < 4) return 0;
2108     break;
2109   case T_SHORT:
2110     if (size < 4) return 0;
2111     break;
2112   default:
2113     ShouldNotReachHere();
2114   }
2115   return size;
2116 }
2117 
2118 // Limits on vector size (number of elements) loaded into vector.
2119 const int Matcher::max_vector_size(const BasicType bt) {
2120   return vector_width_in_bytes(bt)/type2aelembytes(bt);
2121 }
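     // Minimum vector size, in elements. For example, min_vector_size(T_BYTE) is 4 and
     // min_vector_size(T_DOUBLE) is 1 (single-element double vectors are allowed for SVML
     // vector calls).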
2122 const int Matcher::min_vector_size(const BasicType bt) {
2123   int max_size = max_vector_size(bt);
2124   // The minimum size that can be loaded into a vector is 4 bytes.
2125   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
2126   // Support single-element (64-bit) double vectors for SVML vector calls.
2127   if (bt == T_DOUBLE) {
2128     size = 1;
2129   }
2130   return MIN2(size,max_size);
2131 }
2132 
2133 const int Matcher::scalable_vector_reg_size(const BasicType bt) {
2134   return -1;
2135 }
2136 
2137 // Vector ideal reg corresponding to specified size in bytes
2138 const uint Matcher::vector_ideal_reg(int size) {
2139   assert(MaxVectorSize >= size, "");
2140   switch(size) {
2141     case  4: return Op_VecS;
2142     case  8: return Op_VecD;
2143     case 16: return Op_VecX;
2144     case 32: return Op_VecY;
2145     case 64: return Op_VecZ;
2146   }
2147   ShouldNotReachHere();
2148   return 0;
2149 }
2150 
2151 // Also check for a shift by a small constant (a scaled index) that can be cloned into the address expression.
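     // For example, the (LShiftX idx #2) that scales an int array index can be subsumed
     // into the addressing mode, so the shift and its inputs are pushed for cloning instead
     // of being computed into a register.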
2152 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
2153   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
2154       shift->in(2)->get_int() <= 3 &&
2155       // Are there other uses besides address expressions?
2156       !matcher->is_visited(shift)) {
2157     address_visited.set(shift->_idx); // Flag as address_visited
2158     mstack.push(shift->in(2), Matcher::Visit);
2159     Node *conv = shift->in(1);
2160 #ifdef _LP64
2161     // Allow the Matcher to match the rule that bypasses the
2162     // ConvI2L operation for an array index on LP64
2163     // if the index value is known to be positive.
2164     if (conv->Opcode() == Op_ConvI2L &&
2165         conv->as_Type()->type()->is_long()->_lo >= 0 &&
2166         // Are there other uses besides address expressions?
2167         !matcher->is_visited(conv)) {
2168       address_visited.set(conv->_idx); // Flag as address_visited
2169       mstack.push(conv->in(1), Matcher::Pre_Visit);
2170     } else
2171 #endif
2172       mstack.push(conv, Matcher::Pre_Visit);
2173     return true;
2174   }
2175   return false;
2176 }
2177 
2178 // This function identifies sub-graphs in which a 'load' node is
2179 // an input to two different nodes, such that the sub-graph can be
2180 // matched to BMI instructions like blsi, blsr, etc.
2181 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
2182 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
2183 // refers to the same node.
2184 //
2185 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
2186 // This is a temporary solution until we make DAGs expressible in ADL.
2187 template<typename ConType>
2188 class FusedPatternMatcher {
2189   Node* _op1_node;
2190   Node* _mop_node;
2191   int _con_op;
2192 
2193   static int match_next(Node* n, int next_op, int next_op_idx) {
2194     if (n->in(1) == NULL || n->in(2) == NULL) {
2195       return -1;
2196     }
2197 
2198     if (next_op_idx == -1) { // n is commutative, try rotations
2199       if (n->in(1)->Opcode() == next_op) {
2200         return 1;
2201       } else if (n->in(2)->Opcode() == next_op) {
2202         return 2;
2203       }
2204     } else {
2205       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
2206       if (n->in(next_op_idx)->Opcode() == next_op) {
2207         return next_op_idx;
2208       }
2209     }
2210     return -1;
2211   }
2212 
2213  public:
2214   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
2215     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
2216 
2217   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
2218              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
2219              typename ConType::NativeType con_value) {
2220     if (_op1_node->Opcode() != op1) {
2221       return false;
2222     }
2223     if (_mop_node->outcnt() > 2) {
2224       return false;
2225     }
2226     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
2227     if (op1_op2_idx == -1) {
2228       return false;
2229     }
2230     // Memory operation must be the other edge
2231     int op1_mop_idx = (op1_op2_idx & 1) + 1;
2232 
2233     // Check that the mop node is really what we want
2234     if (_op1_node->in(op1_mop_idx) == _mop_node) {
2235       Node* op2_node = _op1_node->in(op1_op2_idx);
2236       if (op2_node->outcnt() > 1) {
2237         return false;
2238       }
2239       assert(op2_node->Opcode() == op2, "Should be");
2240       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
2241       if (op2_con_idx == -1) {
2242         return false;
2243       }
2244       // Memory operation must be the other edge
2245       int op2_mop_idx = (op2_con_idx & 1) + 1;
2246       // Check that the memory operation is the same node
2247       if (op2_node->in(op2_mop_idx) == _mop_node) {
2248         // Now check the constant
2249         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
2250         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
2251           return true;
2252         }
2253       }
2254     }
2255     return false;
2256   }
2257 };
2258 
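     // Recognize the BMI1 idioms handled below:
     //   (AndI (SubI 0 load) load)  is  load & -load       -> blsi
     //   (AndI (AddI load -1) load) is  load & (load - 1)  -> blsr
     //   (XorI (AddI load -1) load) is  load ^ (load - 1)  -> blsmsk
     // and likewise for the long (AndL/XorL with SubL/AddL) variants.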
2259 static bool is_bmi_pattern(Node* n, Node* m) {
2260   assert(UseBMI1Instructions, "sanity");
2261   if (n != NULL && m != NULL) {
2262     if (m->Opcode() == Op_LoadI) {
2263       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
2264       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
2265              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
2266              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
2267     } else if (m->Opcode() == Op_LoadL) {
2268       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
2269       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
2270              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
2271              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
2272     }
2273   }
2274   return false;
2275 }
2276 
2277 // Should the matcher clone input 'm' of node 'n'?
2278 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
2279   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
2280   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
2281     mstack.push(m, Visit);
2282     return true;
2283   }
2284   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
2285     mstack.push(m, Visit);           // m = ShiftCntV
2286     return true;
2287   }
2288   return false;
2289 }
2290 
2291 // Should the Matcher clone shifts on addressing modes, expecting them
2292 // to be subsumed into complex addressing expressions or compute them
2293 // into registers?
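     // For example, when an AddP with a constant offset has an address that is itself an
     // AddP whose offset is a small left shift (a scaled index), both the constant and the
     // shift are typically cloned so the matcher can fold them into a single
     // [base + index*scale + disp] addressing mode.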
2294 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
2295   Node *off = m->in(AddPNode::Offset);
2296   if (off->is_Con()) {
2297     address_visited.test_set(m->_idx); // Flag as address_visited
2298     Node *adr = m->in(AddPNode::Address);
2299 
2300     // Intel can handle 2 adds in an addressing mode.
2301     // AtomicAdd is not an addressing expression;
2302     // it is cheap to detect because its base is bogus (top).
2303     if (adr->is_AddP() &&
2304         !adr->in(AddPNode::Base)->is_top() &&
2305         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
2306         // Are there other uses besides address expressions?
2307         !is_visited(adr)) {
2308       address_visited.set(adr->_idx); // Flag as address_visited
2309       Node *shift = adr->in(AddPNode::Offset);
2310       if (!clone_shift(shift, this, mstack, address_visited)) {
2311         mstack.push(shift, Pre_Visit);
2312       }
2313       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
2314       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
2315     } else {
2316       mstack.push(adr, Pre_Visit);
2317     }
2318 
2319     // Clone X+offset as it also folds into most addressing expressions
2320     mstack.push(off, Visit);
2321     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2322     return true;
2323   } else if (clone_shift(off, this, mstack, address_visited)) {
2324     address_visited.test_set(m->_idx); // Flag as address_visited
2325     mstack.push(m->in(AddPNode::Address), Pre_Visit);
2326     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2327     return true;
2328   }
2329   return false;
2330 }
2331 
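     // Map a BoolTest condition to the predicate encoding used by the EVEX integer
     // compare instructions. Signed and unsigned conditions map to the same value
     // because signedness is selected by the instruction itself (e.g. vpcmpd vs.
     // vpcmpud), not by the predicate.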
2332 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
2333   switch (bt) {
2334     case BoolTest::eq:
2335       return Assembler::eq;
2336     case BoolTest::ne:
2337       return Assembler::neq;
2338     case BoolTest::le:
2339     case BoolTest::ule:
2340       return Assembler::le;
2341     case BoolTest::ge:
2342     case BoolTest::uge:
2343       return Assembler::nlt;
2344     case BoolTest::lt:
2345     case BoolTest::ult:
2346       return Assembler::lt;
2347     case BoolTest::gt:
2348     case BoolTest::ugt:
2349       return Assembler::nle;
2350     default : ShouldNotReachHere(); return Assembler::_false;
2351   }
2352 }
2353 
2354 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
2355   switch (bt) {
2356   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
2357   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
2358   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
2359   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
2360   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
2361   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
2362   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
2363   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
2364   }
2365 }
2366 
2367 // Helper methods for MachSpillCopyNode::implementation().
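     // If UseAVX > 2 but AVX512VL is unavailable, plain (v)movdqu cannot encode the
     // upper-bank registers (xmm16-xmm31), so the copies below fall back to the
     // EVEX vextractf32x4/vextractf64x4 forms (and the matching vinsertf forms for
     // reloads in vec_spill_helper), which only require AVX512F.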
2368 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
2369                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
2370   assert(ireg == Op_VecS || // 32bit vector
2371          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
2372           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
2373          "no non-adjacent vector moves");
2374   if (cbuf) {
2375     C2_MacroAssembler _masm(cbuf);
2376     switch (ireg) {
2377     case Op_VecS: // copy whole register
2378     case Op_VecD:
2379     case Op_VecX:
2380 #ifndef _LP64
2381       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2382 #else
2383       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2384         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2385       } else {
2386         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2387       }
2388 #endif
2389       break;
2390     case Op_VecY:
2391 #ifndef _LP64
2392       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2393 #else
2394       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2395         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2396       } else {
2397         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2398       }
2399 #endif
2400       break;
2401     case Op_VecZ:
2402       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
2403       break;
2404     default:
2405       ShouldNotReachHere();
2406     }
2407 #ifndef PRODUCT
2408   } else {
2409     switch (ireg) {
2410     case Op_VecS:
2411     case Op_VecD:
2412     case Op_VecX:
2413       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2414       break;
2415     case Op_VecY:
2416     case Op_VecZ:
2417       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2418       break;
2419     default:
2420       ShouldNotReachHere();
2421     }
2422 #endif
2423   }
2424 }
2425 
2426 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
2427                      int stack_offset, int reg, uint ireg, outputStream* st) {
2428   if (cbuf) {
2429     C2_MacroAssembler _masm(cbuf);
2430     if (is_load) {
2431       switch (ireg) {
2432       case Op_VecS:
2433         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2434         break;
2435       case Op_VecD:
2436         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2437         break;
2438       case Op_VecX:
2439 #ifndef _LP64
2440         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2441 #else
2442         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2443           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2444         } else {
2445           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2446           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2447         }
2448 #endif
2449         break;
2450       case Op_VecY:
2451 #ifndef _LP64
2452         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2453 #else
2454         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2455           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2456         } else {
2457           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2458           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2459         }
2460 #endif
2461         break;
2462       case Op_VecZ:
2463         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
2464         break;
2465       default:
2466         ShouldNotReachHere();
2467       }
2468     } else { // store
2469       switch (ireg) {
2470       case Op_VecS:
2471         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2472         break;
2473       case Op_VecD:
2474         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2475         break;
2476       case Op_VecX:
2477 #ifndef _LP64
2478         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2479 #else
2480         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2481           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2482         } else {
2484           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2485         }
2486 #endif
2487         break;
2488       case Op_VecY:
2489 #ifndef _LP64
2490         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2491 #else
2492         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2493           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2494         } else {
2496           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2497         }
2498 #endif
2499         break;
2500       case Op_VecZ:
2501         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2502         break;
2503       default:
2504         ShouldNotReachHere();
2505       }
2506     }
2507 #ifndef PRODUCT
2508   } else {
2509     if (is_load) {
2510       switch (ireg) {
2511       case Op_VecS:
2512         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2513         break;
2514       case Op_VecD:
2515         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2516         break;
2517       case Op_VecX:
2518         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2519         break;
2520       case Op_VecY:
2521       case Op_VecZ:
2522         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2523         break;
2524       default:
2525         ShouldNotReachHere();
2526       }
2527     } else { // store
2528       switch (ireg) {
2529       case Op_VecS:
2530         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2531         break;
2532       case Op_VecD:
2533         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2534         break;
2535       case Op_VecX:
2536         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2537         break;
2538       case Op_VecY:
2539       case Op_VecZ:
2540         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2541         break;
2542       default:
2543         ShouldNotReachHere();
2544       }
2545     }
2546 #endif
2547   }
2548 }
2549 
2550 static inline jlong replicate8_imm(int con, int width) {
2551   // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
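       // For example, replicate8_imm(0x1, 1) == 0x0101010101010101 and
       // replicate8_imm(-1, 2)  == 0xFFFFFFFFFFFFFFFF (the sign bits are masked off first).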
2552   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
2553   int bit_width = width * 8;
2554   jlong val = con;
2555   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
2556   while (bit_width < 64) {
2557     val |= (val << bit_width);
2558     bit_width <<= 1;
2559   }
2560   return val;
2561 }
2562 
2563 #ifndef PRODUCT
2564   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2565     st->print("nop \t# %d bytes pad for loops and calls", _count);
2566   }
2567 #endif
2568 
2569   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
2570     C2_MacroAssembler _masm(&cbuf);
2571     __ nop(_count);
2572   }
2573 
2574   uint MachNopNode::size(PhaseRegAlloc*) const {
2575     return _count;
2576   }
2577 
2578 #ifndef PRODUCT
2579   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2580     st->print("# breakpoint");
2581   }
2582 #endif
2583 
2584   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2585     C2_MacroAssembler _masm(&cbuf);
2586     __ int3();
2587   }
2588 
2589   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2590     return MachNode::size(ra_);
2591   }
2592 
2593 %}
2594 
2595 encode %{
2596 
2597   enc_class call_epilog %{
2598     C2_MacroAssembler _masm(&cbuf);
2599     if (VerifyStackAtCalls) {
2600       // Check that stack depth is unchanged: find majik cookie on stack
2601       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2602       Label L;
2603       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2604       __ jccb(Assembler::equal, L);
2605       // Die if stack mismatch
2606       __ int3();
2607       __ bind(L);
2608     }
2609     __ oopmap_metadata(-1);
2610   %}
2611 
2612 %}
2613 
2614 // Operands for bound floating point register arguments
2615 operand rxmm0() %{
2616   constraint(ALLOC_IN_RC(xmm0_reg));
2617   match(VecX);
2618   format %{ %}
2619   interface(REG_INTER);
2620 %}
2621 
2622 //----------OPERANDS-----------------------------------------------------------
2623 // Operand definitions must precede instruction definitions for correct parsing
2624 // in the ADLC because operands constitute user-defined types that are used in
2625 // instruction definitions.
2626 
2627 // Vectors
2628 
2629 // Dummy generic vector class. Should be used for all vector operands.
2630 // Replaced with vec[SDXYZ] during post-selection pass.
2631 operand vec() %{
2632   constraint(ALLOC_IN_RC(dynamic));
2633   match(VecX);
2634   match(VecY);
2635   match(VecZ);
2636   match(VecS);
2637   match(VecD);
2638 
2639   format %{ %}
2640   interface(REG_INTER);
2641 %}
2642 
2643 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
2644 // Replaced with legVec[SDXYZ] during post-selection cleanup.
2645 // Note: the legacy register class is used to avoid the extra runtime code
2646 // generation via reg_class_dynamic, which is unneeded in the 32-bit VM.
2647 operand legVec() %{
2648   constraint(ALLOC_IN_RC(dynamic));
2649   match(VecX);
2650   match(VecY);
2651   match(VecZ);
2652   match(VecS);
2653   match(VecD);
2654 
2655   format %{ %}
2656   interface(REG_INTER);
2657 %}
2658 
2659 // Replaces vec during post-selection cleanup. See above.
2660 operand vecS() %{
2661   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
2662   match(VecS);
2663 
2664   format %{ %}
2665   interface(REG_INTER);
2666 %}
2667 
2668 // Replaces legVec during post-selection cleanup. See above.
2669 operand legVecS() %{
2670   constraint(ALLOC_IN_RC(vectors_reg_legacy));
2671   match(VecS);
2672 
2673   format %{ %}
2674   interface(REG_INTER);
2675 %}
2676 
2677 // Replaces vec during post-selection cleanup. See above.
2678 operand vecD() %{
2679   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
2680   match(VecD);
2681 
2682   format %{ %}
2683   interface(REG_INTER);
2684 %}
2685 
2686 // Replaces legVec during post-selection cleanup. See above.
2687 operand legVecD() %{
2688   constraint(ALLOC_IN_RC(vectord_reg_legacy));
2689   match(VecD);
2690 
2691   format %{ %}
2692   interface(REG_INTER);
2693 %}
2694 
2695 // Replaces vec during post-selection cleanup. See above.
2696 operand vecX() %{
2697   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
2698   match(VecX);
2699 
2700   format %{ %}
2701   interface(REG_INTER);
2702 %}
2703 
2704 // Replaces legVec during post-selection cleanup. See above.
2705 operand legVecX() %{
2706   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
2707   match(VecX);
2708 
2709   format %{ %}
2710   interface(REG_INTER);
2711 %}
2712 
2713 // Replaces vec during post-selection cleanup. See above.
2714 operand vecY() %{
2715   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
2716   match(VecY);
2717 
2718   format %{ %}
2719   interface(REG_INTER);
2720 %}
2721 
2722 // Replaces legVec during post-selection cleanup. See above.
2723 operand legVecY() %{
2724   constraint(ALLOC_IN_RC(vectory_reg_legacy));
2725   match(VecY);
2726 
2727   format %{ %}
2728   interface(REG_INTER);
2729 %}
2730 
2731 // Replaces vec during post-selection cleanup. See above.
2732 operand vecZ() %{
2733   constraint(ALLOC_IN_RC(vectorz_reg));
2734   match(VecZ);
2735 
2736   format %{ %}
2737   interface(REG_INTER);
2738 %}
2739 
2740 // Replaces legVec during post-selection cleanup. See above.
2741 operand legVecZ() %{
2742   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
2743   match(VecZ);
2744 
2745   format %{ %}
2746   interface(REG_INTER);
2747 %}
2748 
2749 // Comparison Code for FP conditional move
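     // The encodings below are the vcmppd imm8 predicate values
     // (EQ_OQ=0x0, LT_OS=0x1, LE_OS=0x2, NEQ_OQ=0xC, GE_OS=0xD, GT_OS=0xE).
     // The overflow/no_overflow entries are dummies that the operand's predicate
     // prevents from ever being selected.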
2750 operand cmpOp_vcmppd() %{
2751   match(Bool);
2752 
2753   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2754             n->as_Bool()->_test._test != BoolTest::no_overflow);
2755   format %{ "" %}
2756   interface(COND_INTER) %{
2757     equal        (0x0, "eq");
2758     less         (0x1, "lt");
2759     less_equal   (0x2, "le");
2760     not_equal    (0xC, "ne");
2761     greater_equal(0xD, "ge");
2762     greater      (0xE, "gt");
2763     // TODO: adlc fails to compile without the next two lines, giving this error:
2764     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2765     // equal' for overflow.
2766     overflow     (0x20, "o");  // not really supported by the instruction
2767     no_overflow  (0x21, "no"); // not really supported by the instruction
2768   %}
2769 %}
2770 
2771 
2772 // INSTRUCTIONS -- Platform-independent definitions (same for 32- and 64-bit)
2773 
2774 // ============================================================================
2775 
2776 instruct ShouldNotReachHere() %{
2777   match(Halt);
2778   format %{ "stop\t# ShouldNotReachHere" %}
2779   ins_encode %{
2780     if (is_reachable()) {
2781       __ stop(_halt_reason);
2782     }
2783   %}
2784   ins_pipe(pipe_slow);
2785 %}
2786 
2787 // =================================EVEX special===============================
2788 // The existing partial implementation for post-loop multi-versioning computes
2789 // the mask corresponding to the tail loop in the K1 opmask register. It may then be
2790 // used to predicate instructions in the loop body during the last post-loop iteration.
2791 // TODO: Remove the hard-coded K1 usage while fixing the existing post-loop
2792 // multi-versioning support.
2793 instruct setMask(rRegI dst, rRegI src, kReg_K1 mask) %{
2794   predicate(PostLoopMultiversioning && Matcher::has_predicated_vectors());
2795   match(Set dst (SetVectMaskI  src));
2796   effect(TEMP dst);
2797   format %{ "setvectmask   $dst, $src" %}
2798   ins_encode %{
2799     __ setvectmask($dst$$Register, $src$$Register, $mask$$KRegister);
2800   %}
2801   ins_pipe(pipe_slow);
2802 %}
2803 
2804 // ============================================================================
2805 
2806 instruct addF_reg(regF dst, regF src) %{
2807   predicate((UseSSE>=1) && (UseAVX == 0));
2808   match(Set dst (AddF dst src));
2809 
2810   format %{ "addss   $dst, $src" %}
2811   ins_cost(150);
2812   ins_encode %{
2813     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2814   %}
2815   ins_pipe(pipe_slow);
2816 %}
2817 
2818 instruct addF_mem(regF dst, memory src) %{
2819   predicate((UseSSE>=1) && (UseAVX == 0));
2820   match(Set dst (AddF dst (LoadF src)));
2821 
2822   format %{ "addss   $dst, $src" %}
2823   ins_cost(150);
2824   ins_encode %{
2825     __ addss($dst$$XMMRegister, $src$$Address);
2826   %}
2827   ins_pipe(pipe_slow);
2828 %}
2829 
2830 instruct addF_imm(regF dst, immF con) %{
2831   predicate((UseSSE>=1) && (UseAVX == 0));
2832   match(Set dst (AddF dst con));
2833   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2834   ins_cost(150);
2835   ins_encode %{
2836     __ addss($dst$$XMMRegister, $constantaddress($con));
2837   %}
2838   ins_pipe(pipe_slow);
2839 %}
2840 
2841 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2842   predicate(UseAVX > 0);
2843   match(Set dst (AddF src1 src2));
2844 
2845   format %{ "vaddss  $dst, $src1, $src2" %}
2846   ins_cost(150);
2847   ins_encode %{
2848     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2849   %}
2850   ins_pipe(pipe_slow);
2851 %}
2852 
2853 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2854   predicate(UseAVX > 0);
2855   match(Set dst (AddF src1 (LoadF src2)));
2856 
2857   format %{ "vaddss  $dst, $src1, $src2" %}
2858   ins_cost(150);
2859   ins_encode %{
2860     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2861   %}
2862   ins_pipe(pipe_slow);
2863 %}
2864 
2865 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2866   predicate(UseAVX > 0);
2867   match(Set dst (AddF src con));
2868 
2869   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2870   ins_cost(150);
2871   ins_encode %{
2872     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2873   %}
2874   ins_pipe(pipe_slow);
2875 %}
2876 
2877 instruct addD_reg(regD dst, regD src) %{
2878   predicate((UseSSE>=2) && (UseAVX == 0));
2879   match(Set dst (AddD dst src));
2880 
2881   format %{ "addsd   $dst, $src" %}
2882   ins_cost(150);
2883   ins_encode %{
2884     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2885   %}
2886   ins_pipe(pipe_slow);
2887 %}
2888 
2889 instruct addD_mem(regD dst, memory src) %{
2890   predicate((UseSSE>=2) && (UseAVX == 0));
2891   match(Set dst (AddD dst (LoadD src)));
2892 
2893   format %{ "addsd   $dst, $src" %}
2894   ins_cost(150);
2895   ins_encode %{
2896     __ addsd($dst$$XMMRegister, $src$$Address);
2897   %}
2898   ins_pipe(pipe_slow);
2899 %}
2900 
2901 instruct addD_imm(regD dst, immD con) %{
2902   predicate((UseSSE>=2) && (UseAVX == 0));
2903   match(Set dst (AddD dst con));
2904   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2905   ins_cost(150);
2906   ins_encode %{
2907     __ addsd($dst$$XMMRegister, $constantaddress($con));
2908   %}
2909   ins_pipe(pipe_slow);
2910 %}
2911 
2912 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2913   predicate(UseAVX > 0);
2914   match(Set dst (AddD src1 src2));
2915 
2916   format %{ "vaddsd  $dst, $src1, $src2" %}
2917   ins_cost(150);
2918   ins_encode %{
2919     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2920   %}
2921   ins_pipe(pipe_slow);
2922 %}
2923 
2924 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2925   predicate(UseAVX > 0);
2926   match(Set dst (AddD src1 (LoadD src2)));
2927 
2928   format %{ "vaddsd  $dst, $src1, $src2" %}
2929   ins_cost(150);
2930   ins_encode %{
2931     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2932   %}
2933   ins_pipe(pipe_slow);
2934 %}
2935 
2936 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2937   predicate(UseAVX > 0);
2938   match(Set dst (AddD src con));
2939 
2940   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2941   ins_cost(150);
2942   ins_encode %{
2943     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2944   %}
2945   ins_pipe(pipe_slow);
2946 %}
2947 
2948 instruct subF_reg(regF dst, regF src) %{
2949   predicate((UseSSE>=1) && (UseAVX == 0));
2950   match(Set dst (SubF dst src));
2951 
2952   format %{ "subss   $dst, $src" %}
2953   ins_cost(150);
2954   ins_encode %{
2955     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2956   %}
2957   ins_pipe(pipe_slow);
2958 %}
2959 
2960 instruct subF_mem(regF dst, memory src) %{
2961   predicate((UseSSE>=1) && (UseAVX == 0));
2962   match(Set dst (SubF dst (LoadF src)));
2963 
2964   format %{ "subss   $dst, $src" %}
2965   ins_cost(150);
2966   ins_encode %{
2967     __ subss($dst$$XMMRegister, $src$$Address);
2968   %}
2969   ins_pipe(pipe_slow);
2970 %}
2971 
2972 instruct subF_imm(regF dst, immF con) %{
2973   predicate((UseSSE>=1) && (UseAVX == 0));
2974   match(Set dst (SubF dst con));
2975   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2976   ins_cost(150);
2977   ins_encode %{
2978     __ subss($dst$$XMMRegister, $constantaddress($con));
2979   %}
2980   ins_pipe(pipe_slow);
2981 %}
2982 
2983 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2984   predicate(UseAVX > 0);
2985   match(Set dst (SubF src1 src2));
2986 
2987   format %{ "vsubss  $dst, $src1, $src2" %}
2988   ins_cost(150);
2989   ins_encode %{
2990     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2991   %}
2992   ins_pipe(pipe_slow);
2993 %}
2994 
2995 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2996   predicate(UseAVX > 0);
2997   match(Set dst (SubF src1 (LoadF src2)));
2998 
2999   format %{ "vsubss  $dst, $src1, $src2" %}
3000   ins_cost(150);
3001   ins_encode %{
3002     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3003   %}
3004   ins_pipe(pipe_slow);
3005 %}
3006 
3007 instruct subF_reg_imm(regF dst, regF src, immF con) %{
3008   predicate(UseAVX > 0);
3009   match(Set dst (SubF src con));
3010 
3011   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3012   ins_cost(150);
3013   ins_encode %{
3014     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3015   %}
3016   ins_pipe(pipe_slow);
3017 %}
3018 
3019 instruct subD_reg(regD dst, regD src) %{
3020   predicate((UseSSE>=2) && (UseAVX == 0));
3021   match(Set dst (SubD dst src));
3022 
3023   format %{ "subsd   $dst, $src" %}
3024   ins_cost(150);
3025   ins_encode %{
3026     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
3027   %}
3028   ins_pipe(pipe_slow);
3029 %}
3030 
3031 instruct subD_mem(regD dst, memory src) %{
3032   predicate((UseSSE>=2) && (UseAVX == 0));
3033   match(Set dst (SubD dst (LoadD src)));
3034 
3035   format %{ "subsd   $dst, $src" %}
3036   ins_cost(150);
3037   ins_encode %{
3038     __ subsd($dst$$XMMRegister, $src$$Address);
3039   %}
3040   ins_pipe(pipe_slow);
3041 %}
3042 
3043 instruct subD_imm(regD dst, immD con) %{
3044   predicate((UseSSE>=2) && (UseAVX == 0));
3045   match(Set dst (SubD dst con));
3046   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3047   ins_cost(150);
3048   ins_encode %{
3049     __ subsd($dst$$XMMRegister, $constantaddress($con));
3050   %}
3051   ins_pipe(pipe_slow);
3052 %}
3053 
3054 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
3055   predicate(UseAVX > 0);
3056   match(Set dst (SubD src1 src2));
3057 
3058   format %{ "vsubsd  $dst, $src1, $src2" %}
3059   ins_cost(150);
3060   ins_encode %{
3061     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3062   %}
3063   ins_pipe(pipe_slow);
3064 %}
3065 
3066 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
3067   predicate(UseAVX > 0);
3068   match(Set dst (SubD src1 (LoadD src2)));
3069 
3070   format %{ "vsubsd  $dst, $src1, $src2" %}
3071   ins_cost(150);
3072   ins_encode %{
3073     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3074   %}
3075   ins_pipe(pipe_slow);
3076 %}
3077 
3078 instruct subD_reg_imm(regD dst, regD src, immD con) %{
3079   predicate(UseAVX > 0);
3080   match(Set dst (SubD src con));
3081 
3082   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3083   ins_cost(150);
3084   ins_encode %{
3085     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3086   %}
3087   ins_pipe(pipe_slow);
3088 %}
3089 
3090 instruct mulF_reg(regF dst, regF src) %{
3091   predicate((UseSSE>=1) && (UseAVX == 0));
3092   match(Set dst (MulF dst src));
3093 
3094   format %{ "mulss   $dst, $src" %}
3095   ins_cost(150);
3096   ins_encode %{
3097     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
3098   %}
3099   ins_pipe(pipe_slow);
3100 %}
3101 
3102 instruct mulF_mem(regF dst, memory src) %{
3103   predicate((UseSSE>=1) && (UseAVX == 0));
3104   match(Set dst (MulF dst (LoadF src)));
3105 
3106   format %{ "mulss   $dst, $src" %}
3107   ins_cost(150);
3108   ins_encode %{
3109     __ mulss($dst$$XMMRegister, $src$$Address);
3110   %}
3111   ins_pipe(pipe_slow);
3112 %}
3113 
3114 instruct mulF_imm(regF dst, immF con) %{
3115   predicate((UseSSE>=1) && (UseAVX == 0));
3116   match(Set dst (MulF dst con));
3117   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3118   ins_cost(150);
3119   ins_encode %{
3120     __ mulss($dst$$XMMRegister, $constantaddress($con));
3121   %}
3122   ins_pipe(pipe_slow);
3123 %}
3124 
3125 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
3126   predicate(UseAVX > 0);
3127   match(Set dst (MulF src1 src2));
3128 
3129   format %{ "vmulss  $dst, $src1, $src2" %}
3130   ins_cost(150);
3131   ins_encode %{
3132     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3133   %}
3134   ins_pipe(pipe_slow);
3135 %}
3136 
3137 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
3138   predicate(UseAVX > 0);
3139   match(Set dst (MulF src1 (LoadF src2)));
3140 
3141   format %{ "vmulss  $dst, $src1, $src2" %}
3142   ins_cost(150);
3143   ins_encode %{
3144     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3145   %}
3146   ins_pipe(pipe_slow);
3147 %}
3148 
3149 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
3150   predicate(UseAVX > 0);
3151   match(Set dst (MulF src con));
3152 
3153   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3154   ins_cost(150);
3155   ins_encode %{
3156     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3157   %}
3158   ins_pipe(pipe_slow);
3159 %}
3160 
3161 instruct mulD_reg(regD dst, regD src) %{
3162   predicate((UseSSE>=2) && (UseAVX == 0));
3163   match(Set dst (MulD dst src));
3164 
3165   format %{ "mulsd   $dst, $src" %}
3166   ins_cost(150);
3167   ins_encode %{
3168     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
3169   %}
3170   ins_pipe(pipe_slow);
3171 %}
3172 
3173 instruct mulD_mem(regD dst, memory src) %{
3174   predicate((UseSSE>=2) && (UseAVX == 0));
3175   match(Set dst (MulD dst (LoadD src)));
3176 
3177   format %{ "mulsd   $dst, $src" %}
3178   ins_cost(150);
3179   ins_encode %{
3180     __ mulsd($dst$$XMMRegister, $src$$Address);
3181   %}
3182   ins_pipe(pipe_slow);
3183 %}
3184 
3185 instruct mulD_imm(regD dst, immD con) %{
3186   predicate((UseSSE>=2) && (UseAVX == 0));
3187   match(Set dst (MulD dst con));
3188   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3189   ins_cost(150);
3190   ins_encode %{
3191     __ mulsd($dst$$XMMRegister, $constantaddress($con));
3192   %}
3193   ins_pipe(pipe_slow);
3194 %}
3195 
3196 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
3197   predicate(UseAVX > 0);
3198   match(Set dst (MulD src1 src2));
3199 
3200   format %{ "vmulsd  $dst, $src1, $src2" %}
3201   ins_cost(150);
3202   ins_encode %{
3203     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3204   %}
3205   ins_pipe(pipe_slow);
3206 %}
3207 
3208 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
3209   predicate(UseAVX > 0);
3210   match(Set dst (MulD src1 (LoadD src2)));
3211 
3212   format %{ "vmulsd  $dst, $src1, $src2" %}
3213   ins_cost(150);
3214   ins_encode %{
3215     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3216   %}
3217   ins_pipe(pipe_slow);
3218 %}
3219 
3220 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
3221   predicate(UseAVX > 0);
3222   match(Set dst (MulD src con));
3223 
3224   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3225   ins_cost(150);
3226   ins_encode %{
3227     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3228   %}
3229   ins_pipe(pipe_slow);
3230 %}
3231 
3232 instruct divF_reg(regF dst, regF src) %{
3233   predicate((UseSSE>=1) && (UseAVX == 0));
3234   match(Set dst (DivF dst src));
3235 
3236   format %{ "divss   $dst, $src" %}
3237   ins_cost(150);
3238   ins_encode %{
3239     __ divss($dst$$XMMRegister, $src$$XMMRegister);
3240   %}
3241   ins_pipe(pipe_slow);
3242 %}
3243 
3244 instruct divF_mem(regF dst, memory src) %{
3245   predicate((UseSSE>=1) && (UseAVX == 0));
3246   match(Set dst (DivF dst (LoadF src)));
3247 
3248   format %{ "divss   $dst, $src" %}
3249   ins_cost(150);
3250   ins_encode %{
3251     __ divss($dst$$XMMRegister, $src$$Address);
3252   %}
3253   ins_pipe(pipe_slow);
3254 %}
3255 
3256 instruct divF_imm(regF dst, immF con) %{
3257   predicate((UseSSE>=1) && (UseAVX == 0));
3258   match(Set dst (DivF dst con));
3259   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3260   ins_cost(150);
3261   ins_encode %{
3262     __ divss($dst$$XMMRegister, $constantaddress($con));
3263   %}
3264   ins_pipe(pipe_slow);
3265 %}
3266 
3267 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
3268   predicate(UseAVX > 0);
3269   match(Set dst (DivF src1 src2));
3270 
3271   format %{ "vdivss  $dst, $src1, $src2" %}
3272   ins_cost(150);
3273   ins_encode %{
3274     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3275   %}
3276   ins_pipe(pipe_slow);
3277 %}
3278 
3279 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
3280   predicate(UseAVX > 0);
3281   match(Set dst (DivF src1 (LoadF src2)));
3282 
3283   format %{ "vdivss  $dst, $src1, $src2" %}
3284   ins_cost(150);
3285   ins_encode %{
3286     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3287   %}
3288   ins_pipe(pipe_slow);
3289 %}
3290 
3291 instruct divF_reg_imm(regF dst, regF src, immF con) %{
3292   predicate(UseAVX > 0);
3293   match(Set dst (DivF src con));
3294 
3295   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3296   ins_cost(150);
3297   ins_encode %{
3298     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3299   %}
3300   ins_pipe(pipe_slow);
3301 %}
3302 
3303 instruct divD_reg(regD dst, regD src) %{
3304   predicate((UseSSE>=2) && (UseAVX == 0));
3305   match(Set dst (DivD dst src));
3306 
3307   format %{ "divsd   $dst, $src" %}
3308   ins_cost(150);
3309   ins_encode %{
3310     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
3311   %}
3312   ins_pipe(pipe_slow);
3313 %}
3314 
3315 instruct divD_mem(regD dst, memory src) %{
3316   predicate((UseSSE>=2) && (UseAVX == 0));
3317   match(Set dst (DivD dst (LoadD src)));
3318 
3319   format %{ "divsd   $dst, $src" %}
3320   ins_cost(150);
3321   ins_encode %{
3322     __ divsd($dst$$XMMRegister, $src$$Address);
3323   %}
3324   ins_pipe(pipe_slow);
3325 %}
3326 
3327 instruct divD_imm(regD dst, immD con) %{
3328   predicate((UseSSE>=2) && (UseAVX == 0));
3329   match(Set dst (DivD dst con));
3330   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3331   ins_cost(150);
3332   ins_encode %{
3333     __ divsd($dst$$XMMRegister, $constantaddress($con));
3334   %}
3335   ins_pipe(pipe_slow);
3336 %}
3337 
3338 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
3339   predicate(UseAVX > 0);
3340   match(Set dst (DivD src1 src2));
3341 
3342   format %{ "vdivsd  $dst, $src1, $src2" %}
3343   ins_cost(150);
3344   ins_encode %{
3345     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3346   %}
3347   ins_pipe(pipe_slow);
3348 %}
3349 
3350 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
3351   predicate(UseAVX > 0);
3352   match(Set dst (DivD src1 (LoadD src2)));
3353 
3354   format %{ "vdivsd  $dst, $src1, $src2" %}
3355   ins_cost(150);
3356   ins_encode %{
3357     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3358   %}
3359   ins_pipe(pipe_slow);
3360 %}
3361 
3362 instruct divD_reg_imm(regD dst, regD src, immD con) %{
3363   predicate(UseAVX > 0);
3364   match(Set dst (DivD src con));
3365 
3366   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3367   ins_cost(150);
3368   ins_encode %{
3369     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3370   %}
3371   ins_pipe(pipe_slow);
3372 %}
3373 
3374 instruct absF_reg(regF dst) %{
3375   predicate((UseSSE>=1) && (UseAVX == 0));
3376   match(Set dst (AbsF dst));
3377   ins_cost(150);
3378   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
3379   ins_encode %{
3380     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
3381   %}
3382   ins_pipe(pipe_slow);
3383 %}
3384 
3385 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
3386   predicate(UseAVX > 0);
3387   match(Set dst (AbsF src));
3388   ins_cost(150);
3389   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
3390   ins_encode %{
3391     int vlen_enc = Assembler::AVX_128bit;
3392     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
3393               ExternalAddress(float_signmask()), vlen_enc);
3394   %}
3395   ins_pipe(pipe_slow);
3396 %}
3397 
3398 instruct absD_reg(regD dst) %{
3399   predicate((UseSSE>=2) && (UseAVX == 0));
3400   match(Set dst (AbsD dst));
3401   ins_cost(150);
3402   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
3403             "# abs double by sign masking" %}
3404   ins_encode %{
3405     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
3406   %}
3407   ins_pipe(pipe_slow);
3408 %}
3409 
3410 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
3411   predicate(UseAVX > 0);
3412   match(Set dst (AbsD src));
3413   ins_cost(150);
3414   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
3415             "# abs double by sign masking" %}
3416   ins_encode %{
3417     int vlen_enc = Assembler::AVX_128bit;
3418     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
3419               ExternalAddress(double_signmask()), vlen_enc);
3420   %}
3421   ins_pipe(pipe_slow);
3422 %}
3423 
3424 instruct negF_reg(regF dst) %{
3425   predicate((UseSSE>=1) && (UseAVX == 0));
3426   match(Set dst (NegF dst));
3427   ins_cost(150);
3428   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
3429   ins_encode %{
3430     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
3431   %}
3432   ins_pipe(pipe_slow);
3433 %}
3434 
3435 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
3436   predicate(UseAVX > 0);
3437   match(Set dst (NegF src));
3438   ins_cost(150);
3439   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
3440   ins_encode %{
3441     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
3442                  ExternalAddress(float_signflip()));
3443   %}
3444   ins_pipe(pipe_slow);
3445 %}
3446 
3447 instruct negD_reg(regD dst) %{
3448   predicate((UseSSE>=2) && (UseAVX == 0));
3449   match(Set dst (NegD dst));
3450   ins_cost(150);
3451   format %{ "xorpd   $dst, [0x8000000000000000]\t"
3452             "# neg double by sign flipping" %}
3453   ins_encode %{
3454     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
3455   %}
3456   ins_pipe(pipe_slow);
3457 %}
3458 
3459 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
3460   predicate(UseAVX > 0);
3461   match(Set dst (NegD src));
3462   ins_cost(150);
3463   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
3464             "# neg double by sign flipping" %}
3465   ins_encode %{
3466     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
3467                  ExternalAddress(double_signflip()));
3468   %}
3469   ins_pipe(pipe_slow);
3470 %}
3471 
3472 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
3473 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
3474 instruct sqrtF_reg(regF dst) %{
3475   predicate(UseSSE>=1);
3476   match(Set dst (SqrtF dst));
3477   format %{ "sqrtss  $dst, $dst" %}
3478   ins_encode %{
3479     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
3480   %}
3481   ins_pipe(pipe_slow);
3482 %}
3483 
3484 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
3485 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
3486 instruct sqrtD_reg(regD dst) %{
3487   predicate(UseSSE>=2);
3488   match(Set dst (SqrtD dst));
3489   format %{ "sqrtsd  $dst, $dst" %}
3490   ins_encode %{
3491     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
3492   %}
3493   ins_pipe(pipe_slow);
3494 %}
3495 
3496 
3497 // ---------------------------------------- VectorReinterpret ------------------------------------
3498 instruct reinterpret_mask(kReg dst) %{
3499   predicate(n->bottom_type()->isa_vectmask() &&
3500             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
3501   match(Set dst (VectorReinterpret dst));
3502   ins_cost(125);
3503   format %{ "vector_reinterpret $dst\t!" %}
3504   ins_encode %{
3505     // empty
3506   %}
3507   ins_pipe( pipe_slow );
3508 %}
3509 
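     // The mask reinterprets below change the lane width of an opmask without
     // changing its size in bits: the kmask is expanded into a vector whose lanes
     // are all-ones or all-zeros (evpmovm2w/d/q) and then re-extracted at byte
     // granularity (evpmovb2m), duplicating each source mask bit across the bytes
     // it covers.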
3510 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
3511   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3512             n->bottom_type()->isa_vectmask() &&
3513             n->in(1)->bottom_type()->isa_vectmask() &&
3514             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
3515             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE);
3516   match(Set dst (VectorReinterpret src));
3517   effect(TEMP xtmp);
3518   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
3519   ins_encode %{
3520      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
3521      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3522      assert(src_sz == dst_sz, "src and dst size mismatch");
3523      int vlen_enc = vector_length_encoding(src_sz);
3524      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3525      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3526   %}
3527   ins_pipe( pipe_slow );
3528 %}
3529 
3530 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
3531   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3532             n->bottom_type()->isa_vectmask() &&
3533             n->in(1)->bottom_type()->isa_vectmask() &&
3534             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
3535              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
3536             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE);
3537   match(Set dst (VectorReinterpret src));
3538   effect(TEMP xtmp);
3539   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
3540   ins_encode %{
3541      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
3542      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3543      assert(src_sz == dst_sz, "src and dst size mismatch");
3544      int vlen_enc = vector_length_encoding(src_sz);
3545      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3546      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3547   %}
3548   ins_pipe( pipe_slow );
3549 %}
3550 
3551 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
3552   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3553             n->bottom_type()->isa_vectmask() &&
3554             n->in(1)->bottom_type()->isa_vectmask() &&
3555             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
3556              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
3557             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE);
3558   match(Set dst (VectorReinterpret src));
3559   effect(TEMP xtmp);
3560   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
3561   ins_encode %{
3562      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
3563      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3564      assert(src_sz == dst_sz, "src and dst size mismatch");
3565      int vlen_enc = vector_length_encoding(src_sz);
3566      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3567      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3568   %}
3569   ins_pipe( pipe_slow );
3570 %}
3571 
3572 instruct reinterpret(vec dst) %{
3573   predicate(!n->bottom_type()->isa_vectmask() &&
3574             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
3575   match(Set dst (VectorReinterpret dst));
3576   ins_cost(125);
3577   format %{ "vector_reinterpret $dst\t!" %}
3578   ins_encode %{
3579     // empty
3580   %}
3581   ins_pipe( pipe_slow );
3582 %}
3583 
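     // The expanding reinterprets below must clear the bytes above the source size:
     // the SSE variant AND-s the source with a 32-/64-bit "keep" mask, while the
     // AVX variants use vpand or rely on moves that implicitly zero the upper bits.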
3584 instruct reinterpret_expand(vec dst, vec src, rRegP scratch) %{
3585   predicate(UseAVX == 0 &&
3586             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3587   match(Set dst (VectorReinterpret src));
3588   ins_cost(125);
3589   effect(TEMP dst, TEMP scratch);
3590   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3591   ins_encode %{
3592     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
3593     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
3594 
3595     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
3596     if (src_vlen_in_bytes == 4) {
3597       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3598     } else {
3599       assert(src_vlen_in_bytes == 8, "");
3600       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3601     }
3602     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3603   %}
3604   ins_pipe( pipe_slow );
3605 %}
3606 
3607 instruct vreinterpret_expand4(legVec dst, vec src, rRegP scratch) %{
3608   predicate(UseAVX > 0 &&
3609             !n->bottom_type()->isa_vectmask() &&
3610             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
3611             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3612   match(Set dst (VectorReinterpret src));
3613   ins_cost(125);
3614   effect(TEMP scratch);
3615   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3616   ins_encode %{
3617     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, $scratch$$Register);
3618   %}
3619   ins_pipe( pipe_slow );
3620 %}
3621 
3622 
3623 instruct vreinterpret_expand(legVec dst, vec src) %{
3624   predicate(UseAVX > 0 &&
3625             !n->bottom_type()->isa_vectmask() &&
3626             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
3627             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3628   match(Set dst (VectorReinterpret src));
3629   ins_cost(125);
3630   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
3631   ins_encode %{
3632     switch (Matcher::vector_length_in_bytes(this, $src)) {
3633       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3634       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3635       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3636       default: ShouldNotReachHere();
3637     }
3638   %}
3639   ins_pipe( pipe_slow );
3640 %}
3641 
3642 instruct reinterpret_shrink(vec dst, legVec src) %{
3643   predicate(!n->bottom_type()->isa_vectmask() &&
3644             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
3645   match(Set dst (VectorReinterpret src));
3646   ins_cost(125);
3647   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
3648   ins_encode %{
3649     switch (Matcher::vector_length_in_bytes(this)) {
3650       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
3651       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3652       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3653       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3654       default: ShouldNotReachHere();
3655     }
3656   %}
3657   ins_pipe( pipe_slow );
3658 %}
3659 
3660 // ----------------------------------------------------------------------------------------------------
3661 
3662 #ifdef _LP64
3663 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
3664   match(Set dst (RoundDoubleMode src rmode));
3665   format %{ "roundsd $dst,$src" %}
3666   ins_cost(150);
3667   ins_encode %{
3668     assert(UseSSE >= 4, "required");
3669     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
3670   %}
3671   ins_pipe(pipe_slow);
3672 %}
3673 
3674 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
3675   match(Set dst (RoundDoubleMode (LoadD src) rmode));
3676   format %{ "roundsd $dst,$src" %}
3677   ins_cost(150);
3678   ins_encode %{
3679     assert(UseSSE >= 4, "required");
3680     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
3681   %}
3682   ins_pipe(pipe_slow);
3683 %}
3684 
3685 instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
3686   match(Set dst (RoundDoubleMode con rmode));
3687   effect(TEMP scratch_reg);
3688   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
3689   ins_cost(150);
3690   ins_encode %{
3691     assert(UseSSE >= 4, "required");
3692     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
3693   %}
3694   ins_pipe(pipe_slow);
3695 %}
3696 
3697 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
3698   predicate(Matcher::vector_length(n) < 8);
3699   match(Set dst (RoundDoubleModeV src rmode));
3700   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
3701   ins_encode %{
3702     assert(UseAVX > 0, "required");
3703     int vlen_enc = vector_length_encoding(this);
3704     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
3705   %}
3706   ins_pipe( pipe_slow );
3707 %}
3708 
3709 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
3710   predicate(Matcher::vector_length(n) == 8);
3711   match(Set dst (RoundDoubleModeV src rmode));
3712   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
3713   ins_encode %{
3714     assert(UseAVX > 2, "required");
3715     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
3716   %}
3717   ins_pipe( pipe_slow );
3718 %}
3719 
3720 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3721   predicate(Matcher::vector_length(n) < 8);
3722   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3723   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3724   ins_encode %{
3725     assert(UseAVX > 0, "required");
3726     int vlen_enc = vector_length_encoding(this);
3727     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
3728   %}
3729   ins_pipe( pipe_slow );
3730 %}
3731 
3732 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3733   predicate(Matcher::vector_length(n) == 8);
3734   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3735   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3736   ins_encode %{
3737     assert(UseAVX > 2, "required");
3738     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3739   %}
3740   ins_pipe( pipe_slow );
3741 %}
3742 #endif // _LP64
3743 
3744 instruct onspinwait() %{
3745   match(OnSpinWait);
3746   ins_cost(200);
3747 
3748   format %{
3749     $$template
3750     $$emit$$"pause\t! membar_onspinwait"
3751   %}
3752   ins_encode %{
3753     __ pause();
3754   %}
3755   ins_pipe(pipe_slow);
3756 %}
3757 
3758 // a * b + c
3759 instruct fmaD_reg(regD a, regD b, regD c) %{
3760   predicate(UseFMA);
3761   match(Set c (FmaD  c (Binary a b)));
3762   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3763   ins_cost(150);
3764   ins_encode %{
3765     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3766   %}
3767   ins_pipe( pipe_slow );
3768 %}
3769 
3770 // a * b + c
3771 instruct fmaF_reg(regF a, regF b, regF c) %{
3772   predicate(UseFMA);
3773   match(Set c (FmaF  c (Binary a b)));
3774   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3775   ins_cost(150);
3776   ins_encode %{
3777     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3778   %}
3779   ins_pipe( pipe_slow );
3780 %}
3781 
3782 // ====================VECTOR INSTRUCTIONS=====================================
3783 
3784 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
3785 instruct MoveVec2Leg(legVec dst, vec src) %{
3786   match(Set dst src);
3787   format %{ "" %}
3788   ins_encode %{
3789     ShouldNotReachHere();
3790   %}
3791   ins_pipe( fpu_reg_reg );
3792 %}
3793 
3794 instruct MoveLeg2Vec(vec dst, legVec src) %{
3795   match(Set dst src);
3796   format %{ "" %}
3797   ins_encode %{
3798     ShouldNotReachHere();
3799   %}
3800   ins_pipe( fpu_reg_reg );
3801 %}
3802 
3803 // ============================================================================
3804 
3805 // Generic load-vector operand pattern
3806 instruct loadV(vec dst, memory mem) %{
3807   match(Set dst (LoadVector mem));
3808   ins_cost(125);
3809   format %{ "load_vector $dst,$mem" %}
3810   ins_encode %{
3811     switch (Matcher::vector_length_in_bytes(this)) {
3812       case  4: __ movdl    ($dst$$XMMRegister, $mem$$Address); break;
3813       case  8: __ movq     ($dst$$XMMRegister, $mem$$Address); break;
3814       case 16: __ movdqu   ($dst$$XMMRegister, $mem$$Address); break;
3815       case 32: __ vmovdqu  ($dst$$XMMRegister, $mem$$Address); break;
3816       case 64: __ evmovdqul($dst$$XMMRegister, $mem$$Address, Assembler::AVX_512bit); break;
3817       default: ShouldNotReachHere();
3818     }
3819   %}
3820   ins_pipe( pipe_slow );
3821 %}
3822 
3823 // Generic store-vector operand pattern.
3824 instruct storeV(memory mem, vec src) %{
3825   match(Set mem (StoreVector mem src));
3826   ins_cost(145);
3827   format %{ "store_vector $mem,$src\n\t" %}
3828   ins_encode %{
3829     switch (Matcher::vector_length_in_bytes(this, $src)) {
3830       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
3831       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
3832       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
3833       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
3834       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
3835       default: ShouldNotReachHere();
3836     }
3837   %}
3838   ins_pipe( pipe_slow );
3839 %}
3840 
3841 // ---------------------------------------- Gather ------------------------------------
3842 
3843 // Gather INT, LONG, FLOAT, DOUBLE
3844 
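     // The AVX2 vgather instructions require an explicit vector mask and clear it
     // as elements are loaded, so an all-ones mask is rematerialized for every
     // unmasked gather. The AVX-512 forms likewise consume their opmask, which is
     // why it is first copied into a temporary k register.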
3845 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
3846   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
3847   match(Set dst (LoadVectorGather mem idx));
3848   effect(TEMP dst, TEMP tmp, TEMP mask);
3849   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
3850   ins_encode %{
3851     assert(UseAVX >= 2, "sanity");
3852 
3853     int vlen_enc = vector_length_encoding(this);
3854     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3855 
3856     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
3857     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3858 
3859     if (vlen_enc == Assembler::AVX_128bit) {
3860       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3861     } else {
3862       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3863     }
3864     __ lea($tmp$$Register, $mem$$Address);
3865     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
3866   %}
3867   ins_pipe( pipe_slow );
3868 %}
3869 
3870 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
3871   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
3872   match(Set dst (LoadVectorGather mem idx));
3873   effect(TEMP dst, TEMP tmp, TEMP ktmp);
3874   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
3875   ins_encode %{
3876     assert(UseAVX > 2, "sanity");
3877 
3878     int vlen_enc = vector_length_encoding(this);
3879     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3880 
3881     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3882 
3883     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3884     __ lea($tmp$$Register, $mem$$Address);
3885     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3886   %}
3887   ins_pipe( pipe_slow );
3888 %}
3889 
3890 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
3891   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
3892   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
  format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
3894   ins_encode %{
3895     assert(UseAVX > 2, "sanity");
3896     int vlen_enc = vector_length_encoding(this);
3897     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3898     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: The gather instruction partially updates the opmask register used
    // for predication, hence the mask operand is copied to a temporary first.
3901     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
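    // dst is TEMP_DEF: clear it up front so lanes skipped by the masked gather
    // read as zero rather than stale register contents.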
3902     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3903     __ lea($tmp$$Register, $mem$$Address);
3904     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3905   %}
3906   ins_pipe( pipe_slow );
3907 %}

// ====================Scatter=======================================
3909 
3910 // Scatter INT, LONG, FLOAT, DOUBLE
3911 
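// Informal sketch of the scatter semantics implemented below: for each lane i
// whose opmask bit is set,
//   *(elem_t*)(base + idx[i] * sizeof(elem_t)) = src[i]
// with base resolved from $mem into $tmp via lea, mirroring the gather case.
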
3912 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
3913   predicate(UseAVX > 2);
3914   match(Set mem (StoreVectorScatter mem (Binary src idx)));
3915   effect(TEMP tmp, TEMP ktmp);
  format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
3917   ins_encode %{
3918     int vlen_enc = vector_length_encoding(this, $src);
3919     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
3920 
3921     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
3922     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3923 
3924     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3925     __ lea($tmp$$Register, $mem$$Address);
3926     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
3927   %}
3928   ins_pipe( pipe_slow );
3929 %}
3930 
3931 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
3932   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
3933   effect(TEMP tmp, TEMP ktmp);
  format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t! using $ktmp and $tmp as TEMP" %}
3935   ins_encode %{
3936     int vlen_enc = vector_length_encoding(this, $src);
3937     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
3938     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
3939     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: The scatter instruction partially updates the opmask register used
    // for predication, hence the mask operand is copied to a temporary first.
3942     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
3943     __ lea($tmp$$Register, $mem$$Address);
3944     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
3945   %}
3946   ins_pipe( pipe_slow );
3947 %}
3948 
3949 // ====================REPLICATE=======================================
3950 
3951 // Replicate byte scalar to be vector
3952 instruct ReplB_reg(vec dst, rRegI src) %{
3953   match(Set dst (ReplicateB src));
3954   format %{ "replicateB $dst,$src" %}
3955   ins_encode %{
3956     uint vlen = Matcher::vector_length(this);
    if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for <512bit operands
3958       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
3959       int vlen_enc = vector_length_encoding(this);
3960       __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
3961     } else if (VM_Version::supports_avx2()) {
3962       int vlen_enc = vector_length_encoding(this);
3963       __ movdl($dst$$XMMRegister, $src$$Register);
3964       __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3965     } else {
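      // Pre-AVX2 fallback, plain SSE: punpcklbw duplicates the byte into a
      // word, pshuflw 0x00 spreads that word across the low 8 bytes, and
      // punpcklqdq (for 16-byte and larger vectors) copies the low quadword
      // into the high one.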
3966       __ movdl($dst$$XMMRegister, $src$$Register);
3967       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3968       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3969       if (vlen >= 16) {
3970         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3971         if (vlen >= 32) {
3972           assert(vlen == 32, "sanity");
3973           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3974         }
3975       }
3976     }
3977   %}
3978   ins_pipe( pipe_slow );
3979 %}
3980 
3981 instruct ReplB_mem(vec dst, memory mem) %{
3982   predicate(VM_Version::supports_avx2());
3983   match(Set dst (ReplicateB (LoadB mem)));
3984   format %{ "replicateB $dst,$mem" %}
3985   ins_encode %{
3986     int vlen_enc = vector_length_encoding(this);
3987     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
3988   %}
3989   ins_pipe( pipe_slow );
3990 %}
3991 
3992 instruct ReplB_imm(vec dst, immI con) %{
3993   match(Set dst (ReplicateB con));
3994   format %{ "replicateB $dst,$con" %}
3995   ins_encode %{
3996     uint vlen = Matcher::vector_length(this);
3997     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 1));
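    // replicate8_imm packs the low byte of the constant into all 8 bytes of a
    // 64-bit value (e.g. a hypothetical constant 0x2A becomes
    // 0x2A2A2A2A2A2A2A2A), so a single 8-byte load already replicates the scalar.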
3998     if (vlen == 4) {
3999       __ movdl($dst$$XMMRegister, const_addr);
4000     } else {
4001       __ movq($dst$$XMMRegister, const_addr);
4002       if (vlen >= 16) {
4003         if (VM_Version::supports_avx2()) {
4004           int vlen_enc = vector_length_encoding(this);
4005           __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4006         } else {
4007           assert(vlen == 16, "sanity");
4008           __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4009         }
4010       }
4011     }
4012   %}
4013   ins_pipe( pipe_slow );
4014 %}
4015 
4016 // Replicate byte scalar zero to be vector
4017 instruct ReplB_zero(vec dst, immI_0 zero) %{
4018   match(Set dst (ReplicateB zero));
4019   format %{ "replicateB $dst,$zero" %}
4020   ins_encode %{
4021     uint vlen = Matcher::vector_length(this);
4022     if (vlen <= 16) {
4023       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4024     } else {
4025       // Use vpxor since AVX512F does not have 512bit vxorpd (requires AVX512DQ).
4026       int vlen_enc = vector_length_encoding(this);
4027       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4028     }
4029   %}
4030   ins_pipe( fpu_reg_reg );
4031 %}
4032 
4033 // ====================ReplicateS=======================================
4034 
4035 instruct ReplS_reg(vec dst, rRegI src) %{
4036   match(Set dst (ReplicateS src));
4037   format %{ "replicateS $dst,$src" %}
4038   ins_encode %{
4039     uint vlen = Matcher::vector_length(this);
    if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for <512bit operands
4041       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
4042       int vlen_enc = vector_length_encoding(this);
4043       __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
4044     } else if (VM_Version::supports_avx2()) {
4045       int vlen_enc = vector_length_encoding(this);
4046       __ movdl($dst$$XMMRegister, $src$$Register);
4047       __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4048     } else {
4049       __ movdl($dst$$XMMRegister, $src$$Register);
4050       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4051       if (vlen >= 8) {
4052         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4053         if (vlen >= 16) {
4054           assert(vlen == 16, "sanity");
4055           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4056         }
4057       }
4058     }
4059   %}
4060   ins_pipe( pipe_slow );
4061 %}
4062 
4063 instruct ReplS_mem(vec dst, memory mem) %{
4064   predicate(VM_Version::supports_avx2());
4065   match(Set dst (ReplicateS (LoadS mem)));
4066   format %{ "replicateS $dst,$mem" %}
4067   ins_encode %{
4068     int vlen_enc = vector_length_encoding(this);
4069     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
4070   %}
4071   ins_pipe( pipe_slow );
4072 %}
4073 
4074 instruct ReplS_imm(vec dst, immI con) %{
4075   match(Set dst (ReplicateS con));
4076   format %{ "replicateS $dst,$con" %}
4077   ins_encode %{
4078     uint vlen = Matcher::vector_length(this);
4079     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 2));
4080     if (vlen == 2) {
4081       __ movdl($dst$$XMMRegister, const_addr);
4082     } else {
4083       __ movq($dst$$XMMRegister, const_addr);
4084       if (vlen >= 8) {
4085         if (VM_Version::supports_avx2()) {
4086           int vlen_enc = vector_length_encoding(this);
4087           __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4088         } else {
4089           assert(vlen == 8, "sanity");
4090           __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4091         }
4092       }
4093     }
4094   %}
4095   ins_pipe( fpu_reg_reg );
4096 %}
4097 
4098 instruct ReplS_zero(vec dst, immI_0 zero) %{
4099   match(Set dst (ReplicateS zero));
4100   format %{ "replicateS $dst,$zero" %}
4101   ins_encode %{
4102     uint vlen = Matcher::vector_length(this);
4103     if (vlen <= 8) {
4104       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4105     } else {
4106       int vlen_enc = vector_length_encoding(this);
4107       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4108     }
4109   %}
4110   ins_pipe( fpu_reg_reg );
4111 %}
4112 
4113 // ====================ReplicateI=======================================
4114 
4115 instruct ReplI_reg(vec dst, rRegI src) %{
4116   match(Set dst (ReplicateI src));
4117   format %{ "replicateI $dst,$src" %}
4118   ins_encode %{
4119     uint vlen = Matcher::vector_length(this);
4120     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4121       int vlen_enc = vector_length_encoding(this);
4122       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
4123     } else if (VM_Version::supports_avx2()) {
4124       int vlen_enc = vector_length_encoding(this);
4125       __ movdl($dst$$XMMRegister, $src$$Register);
4126       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4127     } else {
4128       __ movdl($dst$$XMMRegister, $src$$Register);
4129       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4130       if (vlen >= 8) {
4131         assert(vlen == 8, "sanity");
4132         __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4133       }
4134     }
4135   %}
4136   ins_pipe( pipe_slow );
4137 %}
4138 
4139 instruct ReplI_mem(vec dst, memory mem) %{
4140   match(Set dst (ReplicateI (LoadI mem)));
4141   format %{ "replicateI $dst,$mem" %}
4142   ins_encode %{
4143     uint vlen = Matcher::vector_length(this);
4144     if (vlen <= 4) {
4145       __ movdl($dst$$XMMRegister, $mem$$Address);
4146       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4147     } else {
4148       assert(VM_Version::supports_avx2(), "sanity");
4149       int vlen_enc = vector_length_encoding(this);
4150       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4151     }
4152   %}
4153   ins_pipe( pipe_slow );
4154 %}
4155 
4156 instruct ReplI_imm(vec dst, immI con) %{
4157   match(Set dst (ReplicateI con));
4158   format %{ "replicateI $dst,$con" %}
4159   ins_encode %{
4160     uint vlen = Matcher::vector_length(this);
4161     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 4));
4162     if (vlen <= 4) {
4163       __ movq($dst$$XMMRegister, const_addr);
4164       if (vlen == 4) {
4165         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4166       }
4167     } else {
4168       assert(VM_Version::supports_avx2(), "sanity");
4169       int vlen_enc = vector_length_encoding(this);
4170       __ movq($dst$$XMMRegister, const_addr);
4171       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4172     }
4173   %}
4174   ins_pipe( pipe_slow );
4175 %}
4176 
4177 // Replicate integer (4 byte) scalar zero to be vector
4178 instruct ReplI_zero(vec dst, immI_0 zero) %{
4179   match(Set dst (ReplicateI zero));
4180   format %{ "replicateI $dst,$zero" %}
4181   ins_encode %{
4182     uint vlen = Matcher::vector_length(this);
4183     if (vlen <= 4) {
4184       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4185     } else {
4186       int vlen_enc = vector_length_encoding(this);
4187       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4188     }
4189   %}
4190   ins_pipe( fpu_reg_reg );
4191 %}
4192 
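// An all-ones (-1) constant has the same bit pattern for byte, short and int
// elements, so one rule covers all three ReplicateB/S/I cases; the vallones
// helper typically materializes it with vpcmpeqd (or vpternlogd on 512-bit
// EVEX targets).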
4193 instruct ReplI_M1(vec dst, immI_M1 con) %{
4194   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) >= 16);
4195   match(Set dst (ReplicateB con));
4196   match(Set dst (ReplicateS con));
4197   match(Set dst (ReplicateI con));
4198   effect(TEMP dst);
4199   format %{ "vallones $dst" %}
4200   ins_encode %{
4201     int vector_len = vector_length_encoding(this);
4202     __ vallones($dst$$XMMRegister, vector_len);
4203   %}
4204   ins_pipe( pipe_slow );
4205 %}
4206 
4207 // ====================ReplicateL=======================================
4208 
4209 #ifdef _LP64
4210 // Replicate long (8 byte) scalar to be vector
4211 instruct ReplL_reg(vec dst, rRegL src) %{
4212   match(Set dst (ReplicateL src));
4213   format %{ "replicateL $dst,$src" %}
4214   ins_encode %{
4215     uint vlen = Matcher::vector_length(this);
4216     if (vlen == 2) {
4217       __ movdq($dst$$XMMRegister, $src$$Register);
4218       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4219     } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4220       int vlen_enc = vector_length_encoding(this);
4221       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
4222     } else if (VM_Version::supports_avx2()) {
4223       assert(vlen == 4, "sanity");
4224       int vlen_enc = vector_length_encoding(this);
4225       __ movdq($dst$$XMMRegister, $src$$Register);
4226       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4227     } else {
4228       assert(vlen == 4, "sanity");
4229       __ movdq($dst$$XMMRegister, $src$$Register);
4230       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4231       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4232     }
4233   %}
4234   ins_pipe( pipe_slow );
4235 %}
4236 #else // _LP64
4237 // Replicate long (8 byte) scalar to be vector
4238 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
4239   predicate(Matcher::vector_length(n) <= 4);
4240   match(Set dst (ReplicateL src));
4241   effect(TEMP dst, USE src, TEMP tmp);
4242   format %{ "replicateL $dst,$src" %}
4243   ins_encode %{
4244     uint vlen = Matcher::vector_length(this);
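    // On 32-bit the long scalar lives in a register pair: move the low half and
    // the high half (HIGH_FROM_LOW) into XMM registers separately and merge them
    // with punpckldq before broadcasting the resulting 64-bit lane.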
4245     if (vlen == 2) {
4246       __ movdl($dst$$XMMRegister, $src$$Register);
4247       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4248       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4249       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4250     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4251       int vlen_enc = Assembler::AVX_256bit;
4252       __ movdl($dst$$XMMRegister, $src$$Register);
4253       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4254       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4255       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4256     } else {
4257       __ movdl($dst$$XMMRegister, $src$$Register);
4258       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4259       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4260       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4261       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4262     }
4263   %}
4264   ins_pipe( pipe_slow );
4265 %}
4266 
4267 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
4268   predicate(Matcher::vector_length(n) == 8);
4269   match(Set dst (ReplicateL src));
4270   effect(TEMP dst, USE src, TEMP tmp);
4271   format %{ "replicateL $dst,$src" %}
4272   ins_encode %{
4273     if (VM_Version::supports_avx512vl()) {
4274       __ movdl($dst$$XMMRegister, $src$$Register);
4275       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4276       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4277       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4278       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4279       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4280     } else {
4281       int vlen_enc = Assembler::AVX_512bit;
4282       __ movdl($dst$$XMMRegister, $src$$Register);
4283       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4284       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4285       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4286     }
4287   %}
4288   ins_pipe( pipe_slow );
4289 %}
4290 #endif // _LP64
4291 
4292 instruct ReplL_mem(vec dst, memory mem) %{
4293   match(Set dst (ReplicateL (LoadL mem)));
4294   format %{ "replicateL $dst,$mem" %}
4295   ins_encode %{
4296     uint vlen = Matcher::vector_length(this);
4297     if (vlen == 2) {
4298       __ movq($dst$$XMMRegister, $mem$$Address);
4299       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4300     } else {
4301       assert(VM_Version::supports_avx2(), "sanity");
4302       int vlen_enc = vector_length_encoding(this);
4303       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
4304     }
4305   %}
4306   ins_pipe( pipe_slow );
4307 %}
4308 
4309 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4310 instruct ReplL_imm(vec dst, immL con) %{
4311   match(Set dst (ReplicateL con));
4312   format %{ "replicateL $dst,$con" %}
4313   ins_encode %{
4314     uint vlen = Matcher::vector_length(this);
4315     InternalAddress const_addr = $constantaddress($con);
4316     if (vlen == 2) {
4317       __ movq($dst$$XMMRegister, const_addr);
4318       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4319     } else {
4320       assert(VM_Version::supports_avx2(), "sanity");
4321       int vlen_enc = vector_length_encoding(this);
4322       __ movq($dst$$XMMRegister, const_addr);
4323       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4324     }
4325   %}
4326   ins_pipe( pipe_slow );
4327 %}
4328 
4329 instruct ReplL_zero(vec dst, immL0 zero) %{
4330   match(Set dst (ReplicateL zero));
4331   format %{ "replicateL $dst,$zero" %}
4332   ins_encode %{
4333     int vlen = Matcher::vector_length(this);
4334     if (vlen == 2) {
4335       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4336     } else {
4337       int vlen_enc = vector_length_encoding(this);
4338       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4339     }
4340   %}
4341   ins_pipe( fpu_reg_reg );
4342 %}
4343 
4344 instruct ReplL_M1(vec dst, immL_M1 con) %{
4345   predicate(UseAVX > 0);
4346   match(Set dst (ReplicateL con));
4347   effect(TEMP dst);
4348   format %{ "vallones $dst" %}
4349   ins_encode %{
4350     int vector_len = vector_length_encoding(this);
4351     __ vallones($dst$$XMMRegister, vector_len);
4352   %}
4353   ins_pipe( pipe_slow );
4354 %}
4355 
4356 // ====================ReplicateF=======================================
4357 
4358 instruct ReplF_reg(vec dst, vlRegF src) %{
4359   match(Set dst (ReplicateF src));
4360   format %{ "replicateF $dst,$src" %}
4361   ins_encode %{
4362     uint vlen = Matcher::vector_length(this);
4363     if (vlen <= 4) {
4364       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
    } else if (VM_Version::supports_avx2()) {
4366       int vlen_enc = vector_length_encoding(this);
4367       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4368     } else {
4369       assert(vlen == 8, "sanity");
4370       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4371       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4372     }
4373   %}
4374   ins_pipe( pipe_slow );
4375 %}
4376 
4377 instruct ReplF_mem(vec dst, memory mem) %{
4378   match(Set dst (ReplicateF (LoadF mem)));
4379   format %{ "replicateF $dst,$mem" %}
4380   ins_encode %{
4381     uint vlen = Matcher::vector_length(this);
4382     if (vlen <= 4) {
4383       __ movdl($dst$$XMMRegister, $mem$$Address);
4384       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4385     } else {
4386       assert(VM_Version::supports_avx(), "sanity");
4387       int vlen_enc = vector_length_encoding(this);
4388       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
4389     }
4390   %}
4391   ins_pipe( pipe_slow );
4392 %}
4393 
4394 instruct ReplF_zero(vec dst, immF0 zero) %{
4395   match(Set dst (ReplicateF zero));
4396   format %{ "replicateF $dst,$zero" %}
4397   ins_encode %{
4398     uint vlen = Matcher::vector_length(this);
4399     if (vlen <= 4) {
4400       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4401     } else {
4402       int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4404     }
4405   %}
4406   ins_pipe( fpu_reg_reg );
4407 %}
4408 
4409 // ====================ReplicateD=======================================
4410 
4411 // Replicate double (8 bytes) scalar to be vector
4412 instruct ReplD_reg(vec dst, vlRegD src) %{
4413   match(Set dst (ReplicateD src));
4414   format %{ "replicateD $dst,$src" %}
4415   ins_encode %{
4416     uint vlen = Matcher::vector_length(this);
4417     if (vlen == 2) {
4418       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4419     } else if (VM_Version::supports_avx2()) {
4420       int vlen_enc = vector_length_encoding(this);
4421       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4422     } else {
4423       assert(vlen == 4, "sanity");
4424       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4425       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4426     }
4427   %}
4428   ins_pipe( pipe_slow );
4429 %}
4430 
4431 instruct ReplD_mem(vec dst, memory mem) %{
4432   match(Set dst (ReplicateD (LoadD mem)));
4433   format %{ "replicateD $dst,$mem" %}
4434   ins_encode %{
4435     uint vlen = Matcher::vector_length(this);
4436     if (vlen == 2) {
4437       __ movq($dst$$XMMRegister, $mem$$Address);
4438       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
4439     } else {
4440       assert(VM_Version::supports_avx(), "sanity");
4441       int vlen_enc = vector_length_encoding(this);
4442       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4443     }
4444   %}
4445   ins_pipe( pipe_slow );
4446 %}
4447 
4448 instruct ReplD_zero(vec dst, immD0 zero) %{
4449   match(Set dst (ReplicateD zero));
4450   format %{ "replicateD $dst,$zero" %}
4451   ins_encode %{
4452     uint vlen = Matcher::vector_length(this);
4453     if (vlen == 2) {
4454       __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4455     } else {
4456       int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorpd requires AVX512DQ
4458     }
4459   %}
4460   ins_pipe( fpu_reg_reg );
4461 %}
4462 
4463 // ====================VECTOR INSERT=======================================
4464 
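// VectorInsert replaces the single lane selected by the constant $idx with a
// scalar value.  For vectors wider than 128 bits the affected 128-bit lane is
// extracted, patched with a 128-bit insert, and written back, which is why the
// wider rules need a vector temporary.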
4465 instruct insert(vec dst, rRegI val, immU8 idx) %{
4466   predicate(Matcher::vector_length_in_bytes(n) < 32);
4467   match(Set dst (VectorInsert (Binary dst val) idx));
4468   format %{ "vector_insert $dst,$val,$idx" %}
4469   ins_encode %{
4470     assert(UseSSE >= 4, "required");
4471     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
4472 
4473     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4474 
4475     assert(is_integral_type(elem_bt), "");
4476     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4477 
4478     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4479   %}
4480   ins_pipe( pipe_slow );
4481 %}
4482 
4483 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4484   predicate(Matcher::vector_length_in_bytes(n) == 32);
4485   match(Set dst (VectorInsert (Binary src val) idx));
4486   effect(TEMP vtmp);
4487   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4488   ins_encode %{
4489     int vlen_enc = Assembler::AVX_256bit;
4490     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4491     int elem_per_lane = 16/type2aelembytes(elem_bt);
4492     int log2epr = log2(elem_per_lane);
4493 
4494     assert(is_integral_type(elem_bt), "sanity");
4495     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4496 
4497     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4498     uint y_idx = ($idx$$constant >> log2epr) & 1;
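    // Example with hypothetical values: for an 8-element int vector (256 bits),
    // elem_per_lane == 4 and log2epr == 2, so idx == 6 yields x_idx == 2 (slot
    // within the 128-bit lane) and y_idx == 1 (the upper lane).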
4499     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4500     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4501     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4502   %}
4503   ins_pipe( pipe_slow );
4504 %}
4505 
4506 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4507   predicate(Matcher::vector_length_in_bytes(n) == 64);
4508   match(Set dst (VectorInsert (Binary src val) idx));
4509   effect(TEMP vtmp);
4510   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4511   ins_encode %{
4512     assert(UseAVX > 2, "sanity");
4513 
4514     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4515     int elem_per_lane = 16/type2aelembytes(elem_bt);
4516     int log2epr = log2(elem_per_lane);
4517 
4518     assert(is_integral_type(elem_bt), "");
4519     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4520 
4521     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4522     uint y_idx = ($idx$$constant >> log2epr) & 3;
4523     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4524     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4525     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4526   %}
4527   ins_pipe( pipe_slow );
4528 %}
4529 
4530 #ifdef _LP64
4531 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4532   predicate(Matcher::vector_length(n) == 2);
4533   match(Set dst (VectorInsert (Binary dst val) idx));
4534   format %{ "vector_insert $dst,$val,$idx" %}
4535   ins_encode %{
4536     assert(UseSSE >= 4, "required");
4537     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4538     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4539 
4540     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4541   %}
4542   ins_pipe( pipe_slow );
4543 %}
4544 
4545 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4546   predicate(Matcher::vector_length(n) == 4);
4547   match(Set dst (VectorInsert (Binary src val) idx));
4548   effect(TEMP vtmp);
4549   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4550   ins_encode %{
4551     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4552     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4553 
4554     uint x_idx = $idx$$constant & right_n_bits(1);
4555     uint y_idx = ($idx$$constant >> 1) & 1;
4556     int vlen_enc = Assembler::AVX_256bit;
4557     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4558     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4559     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4560   %}
4561   ins_pipe( pipe_slow );
4562 %}
4563 
4564 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4565   predicate(Matcher::vector_length(n) == 8);
4566   match(Set dst (VectorInsert (Binary src val) idx));
4567   effect(TEMP vtmp);
4568   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4569   ins_encode %{
4570     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
4571     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4572 
4573     uint x_idx = $idx$$constant & right_n_bits(1);
4574     uint y_idx = ($idx$$constant >> 1) & 3;
4575     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4576     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4577     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4578   %}
4579   ins_pipe( pipe_slow );
4580 %}
4581 #endif
4582 
4583 instruct insertF(vec dst, regF val, immU8 idx) %{
4584   predicate(Matcher::vector_length(n) < 8);
4585   match(Set dst (VectorInsert (Binary dst val) idx));
4586   format %{ "vector_insert $dst,$val,$idx" %}
4587   ins_encode %{
4588     assert(UseSSE >= 4, "sanity");
4589 
4590     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4591     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4592 
4593     uint x_idx = $idx$$constant & right_n_bits(2);
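    // insertps selects the destination element through imm8 bits [5:4], hence
    // the x_idx << 4 shift below.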
4594     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
4595   %}
4596   ins_pipe( pipe_slow );
4597 %}
4598 
4599 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4600   predicate(Matcher::vector_length(n) >= 8);
4601   match(Set dst (VectorInsert (Binary src val) idx));
4602   effect(TEMP vtmp);
4603   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4604   ins_encode %{
4605     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4606     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4607 
4608     int vlen = Matcher::vector_length(this);
4609     uint x_idx = $idx$$constant & right_n_bits(2);
4610     if (vlen == 8) {
4611       uint y_idx = ($idx$$constant >> 2) & 1;
4612       int vlen_enc = Assembler::AVX_256bit;
4613       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4614       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
4615       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4616     } else {
4617       assert(vlen == 16, "sanity");
4618       uint y_idx = ($idx$$constant >> 2) & 3;
4619       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4620       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
4621       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4622     }
4623   %}
4624   ins_pipe( pipe_slow );
4625 %}
4626 
4627 #ifdef _LP64
4628 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4629   predicate(Matcher::vector_length(n) == 2);
4630   match(Set dst (VectorInsert (Binary dst val) idx));
4631   effect(TEMP tmp);
4632   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4633   ins_encode %{
4634     assert(UseSSE >= 4, "sanity");
4635     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4636     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4637 
4638     __ movq($tmp$$Register, $val$$XMMRegister);
4639     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
4640   %}
4641   ins_pipe( pipe_slow );
4642 %}
4643 
4644 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
4645   predicate(Matcher::vector_length(n) == 4);
4646   match(Set dst (VectorInsert (Binary src val) idx));
4647   effect(TEMP vtmp, TEMP tmp);
4648   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4649   ins_encode %{
4650     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4651     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4652 
4653     uint x_idx = $idx$$constant & right_n_bits(1);
4654     uint y_idx = ($idx$$constant >> 1) & 1;
4655     int vlen_enc = Assembler::AVX_256bit;
4656     __ movq($tmp$$Register, $val$$XMMRegister);
4657     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4658     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4659     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4660   %}
4661   ins_pipe( pipe_slow );
4662 %}
4663 
4664 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
4665   predicate(Matcher::vector_length(n) == 8);
4666   match(Set dst (VectorInsert (Binary src val) idx));
4667   effect(TEMP tmp, TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4669   ins_encode %{
4670     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4671     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4672 
4673     uint x_idx = $idx$$constant & right_n_bits(1);
4674     uint y_idx = ($idx$$constant >> 1) & 3;
4675     __ movq($tmp$$Register, $val$$XMMRegister);
4676     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4677     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4678     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4679   %}
4680   ins_pipe( pipe_slow );
4681 %}
4682 #endif
4683 
4684 // ====================REDUCTION ARITHMETIC=======================================
4685 
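// A reduction folds every lane of the vector input into a scalar and combines
// it with the scalar input, conceptually dst = src1 OP src2[0] OP ... OP src2[n-1];
// the masm reduce* helpers pick the concrete shuffle/arithmetic sequence for
// each element type and vector length.
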
4686 // =======================Int Reduction==========================================
4687 
4688 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4689   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
4690   match(Set dst (AddReductionVI src1 src2));
4691   match(Set dst (MulReductionVI src1 src2));
4692   match(Set dst (AndReductionV  src1 src2));
4693   match(Set dst ( OrReductionV  src1 src2));
4694   match(Set dst (XorReductionV  src1 src2));
4695   match(Set dst (MinReductionV  src1 src2));
4696   match(Set dst (MaxReductionV  src1 src2));
4697   effect(TEMP vtmp1, TEMP vtmp2);
4698   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4699   ins_encode %{
4700     int opcode = this->ideal_Opcode();
4701     int vlen = Matcher::vector_length(this, $src2);
4702     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4703   %}
4704   ins_pipe( pipe_slow );
4705 %}
4706 
4707 // =======================Long Reduction==========================================
4708 
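// Two forms are provided: the AVX512DQ form operates on the full vec operand
// class, while the fallback is restricted to the legVec (legacy XMM0-XMM15)
// class; the multiply reduction in particular benefits from AVX512DQ, since
// vpmullq requires it.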
4709 #ifdef _LP64
4710 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4711   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
4712   match(Set dst (AddReductionVL src1 src2));
4713   match(Set dst (MulReductionVL src1 src2));
4714   match(Set dst (AndReductionV  src1 src2));
4715   match(Set dst ( OrReductionV  src1 src2));
4716   match(Set dst (XorReductionV  src1 src2));
4717   match(Set dst (MinReductionV  src1 src2));
4718   match(Set dst (MaxReductionV  src1 src2));
4719   effect(TEMP vtmp1, TEMP vtmp2);
4720   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4721   ins_encode %{
4722     int opcode = this->ideal_Opcode();
4723     int vlen = Matcher::vector_length(this, $src2);
4724     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4725   %}
4726   ins_pipe( pipe_slow );
4727 %}
4728 
4729 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
4730   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
4731   match(Set dst (AddReductionVL src1 src2));
4732   match(Set dst (MulReductionVL src1 src2));
4733   match(Set dst (AndReductionV  src1 src2));
4734   match(Set dst ( OrReductionV  src1 src2));
4735   match(Set dst (XorReductionV  src1 src2));
4736   match(Set dst (MinReductionV  src1 src2));
4737   match(Set dst (MaxReductionV  src1 src2));
4738   effect(TEMP vtmp1, TEMP vtmp2);
4739   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4740   ins_encode %{
4741     int opcode = this->ideal_Opcode();
4742     int vlen = Matcher::vector_length(this, $src2);
4743     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4744   %}
4745   ins_pipe( pipe_slow );
4746 %}
4747 #endif // _LP64
4748 
4749 // =======================Float Reduction==========================================
4750 
4751 instruct reductionF128(regF dst, vec src, vec vtmp) %{
4752   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
4753   match(Set dst (AddReductionVF dst src));
4754   match(Set dst (MulReductionVF dst src));
4755   effect(TEMP dst, TEMP vtmp);
4756   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
4757   ins_encode %{
4758     int opcode = this->ideal_Opcode();
4759     int vlen = Matcher::vector_length(this, $src);
4760     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4761   %}
4762   ins_pipe( pipe_slow );
4763 %}
4764 
4765 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
4766   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4767   match(Set dst (AddReductionVF dst src));
4768   match(Set dst (MulReductionVF dst src));
4769   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4770   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4771   ins_encode %{
4772     int opcode = this->ideal_Opcode();
4773     int vlen = Matcher::vector_length(this, $src);
4774     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4775   %}
4776   ins_pipe( pipe_slow );
4777 %}
4778 
4779 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4780   predicate(Matcher::vector_length(n->in(2)) == 16); // src
4781   match(Set dst (AddReductionVF dst src));
4782   match(Set dst (MulReductionVF dst src));
4783   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4784   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4785   ins_encode %{
4786     int opcode = this->ideal_Opcode();
4787     int vlen = Matcher::vector_length(this, $src);
4788     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4789   %}
4790   ins_pipe( pipe_slow );
4791 %}
4792 
4793 // =======================Double Reduction==========================================
4794 
4795 instruct reduction2D(regD dst, vec src, vec vtmp) %{
4796   predicate(Matcher::vector_length(n->in(2)) == 2); // src
4797   match(Set dst (AddReductionVD dst src));
4798   match(Set dst (MulReductionVD dst src));
4799   effect(TEMP dst, TEMP vtmp);
4800   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
4801   ins_encode %{
4802     int opcode = this->ideal_Opcode();
4803     int vlen = Matcher::vector_length(this, $src);
4804     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
  %}
4806   ins_pipe( pipe_slow );
4807 %}
4808 
4809 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
4810   predicate(Matcher::vector_length(n->in(2)) == 4); // src
4811   match(Set dst (AddReductionVD dst src));
4812   match(Set dst (MulReductionVD dst src));
4813   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4814   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4815   ins_encode %{
4816     int opcode = this->ideal_Opcode();
4817     int vlen = Matcher::vector_length(this, $src);
4818     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4819   %}
4820   ins_pipe( pipe_slow );
4821 %}
4822 
4823 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4824   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4825   match(Set dst (AddReductionVD dst src));
4826   match(Set dst (MulReductionVD dst src));
4827   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4828   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4829   ins_encode %{
4830     int opcode = this->ideal_Opcode();
4831     int vlen = Matcher::vector_length(this, $src);
4832     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4833   %}
4834   ins_pipe( pipe_slow );
4835 %}
4836 
4837 // =======================Byte Reduction==========================================
4838 
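// As with the long reductions, two forms exist: without AVX512BW the byte
// reduction is restricted to the legVec (legacy XMM0-XMM15) class, while the
// AVX512BW form may use the full vec class.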
4839 #ifdef _LP64
4840 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4841   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
4842   match(Set dst (AddReductionVI src1 src2));
4843   match(Set dst (AndReductionV  src1 src2));
4844   match(Set dst ( OrReductionV  src1 src2));
4845   match(Set dst (XorReductionV  src1 src2));
4846   match(Set dst (MinReductionV  src1 src2));
4847   match(Set dst (MaxReductionV  src1 src2));
4848   effect(TEMP vtmp1, TEMP vtmp2);
4849   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4850   ins_encode %{
4851     int opcode = this->ideal_Opcode();
4852     int vlen = Matcher::vector_length(this, $src2);
4853     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4854   %}
4855   ins_pipe( pipe_slow );
4856 %}
4857 
4858 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4859   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
4860   match(Set dst (AddReductionVI src1 src2));
4861   match(Set dst (AndReductionV  src1 src2));
4862   match(Set dst ( OrReductionV  src1 src2));
4863   match(Set dst (XorReductionV  src1 src2));
4864   match(Set dst (MinReductionV  src1 src2));
4865   match(Set dst (MaxReductionV  src1 src2));
4866   effect(TEMP vtmp1, TEMP vtmp2);
4867   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4868   ins_encode %{
4869     int opcode = this->ideal_Opcode();
4870     int vlen = Matcher::vector_length(this, $src2);
4871     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4872   %}
4873   ins_pipe( pipe_slow );
4874 %}
4875 #endif
4876 
4877 // =======================Short Reduction==========================================
4878 
4879 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4880   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
4881   match(Set dst (AddReductionVI src1 src2));
4882   match(Set dst (MulReductionVI src1 src2));
4883   match(Set dst (AndReductionV  src1 src2));
4884   match(Set dst ( OrReductionV  src1 src2));
4885   match(Set dst (XorReductionV  src1 src2));
4886   match(Set dst (MinReductionV  src1 src2));
4887   match(Set dst (MaxReductionV  src1 src2));
4888   effect(TEMP vtmp1, TEMP vtmp2);
4889   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4890   ins_encode %{
4891     int opcode = this->ideal_Opcode();
4892     int vlen = Matcher::vector_length(this, $src2);
4893     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4894   %}
4895   ins_pipe( pipe_slow );
4896 %}
4897 
4898 // =======================Mul Reduction==========================================
4899 
4900 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4901   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4902             Matcher::vector_length(n->in(2)) <= 32); // src2
4903   match(Set dst (MulReductionVI src1 src2));
4904   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4905   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4906   ins_encode %{
4907     int opcode = this->ideal_Opcode();
4908     int vlen = Matcher::vector_length(this, $src2);
4909     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4910   %}
4911   ins_pipe( pipe_slow );
4912 %}
4913 
4914 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4915   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4916             Matcher::vector_length(n->in(2)) == 64); // src2
4917   match(Set dst (MulReductionVI src1 src2));
4918   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4919   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4920   ins_encode %{
4921     int opcode = this->ideal_Opcode();
4922     int vlen = Matcher::vector_length(this, $src2);
4923     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4924   %}
4925   ins_pipe( pipe_slow );
4926 %}
4927 
4928 //--------------------Min/Max Float Reduction --------------------
// Float Min/Max Reduction
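// The immF forms below require the scalar input to be the identity value of
// the operation (+Infinity for min, -Infinity for max), as enforced by the
// predicate, so the generated sequence reduces the vector lanes alone and can
// ignore src1.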
4930 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
4931                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4932   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4933             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4934              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4935             Matcher::vector_length(n->in(2)) == 2);
4936   match(Set dst (MinReductionV src1 src2));
4937   match(Set dst (MaxReductionV src1 src2));
4938   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4939   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4940   ins_encode %{
4941     assert(UseAVX > 0, "sanity");
4942 
4943     int opcode = this->ideal_Opcode();
4944     int vlen = Matcher::vector_length(this, $src2);
4945     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4946                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4947   %}
4948   ins_pipe( pipe_slow );
4949 %}
4950 
4951 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
4952                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4953   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4954             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4955              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4956             Matcher::vector_length(n->in(2)) >= 4);
4957   match(Set dst (MinReductionV src1 src2));
4958   match(Set dst (MaxReductionV src1 src2));
4959   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4960   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4961   ins_encode %{
4962     assert(UseAVX > 0, "sanity");
4963 
4964     int opcode = this->ideal_Opcode();
4965     int vlen = Matcher::vector_length(this, $src2);
4966     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4967                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4968   %}
4969   ins_pipe( pipe_slow );
4970 %}
4971 
4972 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
4973                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4974   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4975             Matcher::vector_length(n->in(2)) == 2);
4976   match(Set dst (MinReductionV dst src));
4977   match(Set dst (MaxReductionV dst src));
4978   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4979   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4980   ins_encode %{
4981     assert(UseAVX > 0, "sanity");
4982 
4983     int opcode = this->ideal_Opcode();
4984     int vlen = Matcher::vector_length(this, $src);
4985     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4986                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4987   %}
4988   ins_pipe( pipe_slow );
4989 %}
4990 
4991 
4992 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
4993                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4994   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4995             Matcher::vector_length(n->in(2)) >= 4);
4996   match(Set dst (MinReductionV dst src));
4997   match(Set dst (MaxReductionV dst src));
4998   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4999   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
5000   ins_encode %{
5001     assert(UseAVX > 0, "sanity");
5002 
5003     int opcode = this->ideal_Opcode();
5004     int vlen = Matcher::vector_length(this, $src);
5005     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
5006                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
5007   %}
5008   ins_pipe( pipe_slow );
5009 %}
5010 
5011 
//--------------------Min/Max Double Reduction --------------------
5013 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
5014                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
5015                             rFlagsReg cr) %{
5016   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5017             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
5018              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
5019             Matcher::vector_length(n->in(2)) == 2);
5020   match(Set dst (MinReductionV src1 src2));
5021   match(Set dst (MaxReductionV src1 src2));
5022   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
5023   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
5024   ins_encode %{
5025     assert(UseAVX > 0, "sanity");
5026 
5027     int opcode = this->ideal_Opcode();
5028     int vlen = Matcher::vector_length(this, $src2);
5029     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
5030                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
5031   %}
5032   ins_pipe( pipe_slow );
5033 %}
5034 
5035 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
5036                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
5037                            rFlagsReg cr) %{
5038   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5039             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
5040              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
5041             Matcher::vector_length(n->in(2)) >= 4);
5042   match(Set dst (MinReductionV src1 src2));
5043   match(Set dst (MaxReductionV src1 src2));
5044   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
5045   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
5046   ins_encode %{
5047     assert(UseAVX > 0, "sanity");
5048 
5049     int opcode = this->ideal_Opcode();
5050     int vlen = Matcher::vector_length(this, $src2);
5051     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
5052                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
5053   %}
5054   ins_pipe( pipe_slow );
5055 %}
5056 
5057 
5058 instruct minmax_reduction2D_av(legRegD dst, legVec src,
5059                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
5060                                rFlagsReg cr) %{
5061   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5062             Matcher::vector_length(n->in(2)) == 2);
5063   match(Set dst (MinReductionV dst src));
5064   match(Set dst (MaxReductionV dst src));
5065   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
5066   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
5067   ins_encode %{
5068     assert(UseAVX > 0, "sanity");
5069 
5070     int opcode = this->ideal_Opcode();
5071     int vlen = Matcher::vector_length(this, $src);
5072     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
5073                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
5074   %}
5075   ins_pipe( pipe_slow );
5076 %}
5077 
5078 instruct minmax_reductionD_av(legRegD dst, legVec src,
5079                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
5080                               rFlagsReg cr) %{
5081   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5082             Matcher::vector_length(n->in(2)) >= 4);
5083   match(Set dst (MinReductionV dst src));
5084   match(Set dst (MaxReductionV dst src));
5085   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
5086   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
5087   ins_encode %{
5088     assert(UseAVX > 0, "sanity");
5089 
5090     int opcode = this->ideal_Opcode();
5091     int vlen = Matcher::vector_length(this, $src);
5092     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
5093                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
5094   %}
5095   ins_pipe( pipe_slow );
5096 %}
5097 
5098 // ====================VECTOR ARITHMETIC=======================================
5099 
5100 // --------------------------------- ADD --------------------------------------
5101 
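// Each element type gets three forms: a destructive two-operand SSE rule
// (UseAVX == 0), a three-operand AVX register rule, and a register-memory rule
// that folds the LoadVector when the vector is wider than 8 bytes.
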
5102 // Bytes vector add
5103 instruct vaddB(vec dst, vec src) %{
5104   predicate(UseAVX == 0);
5105   match(Set dst (AddVB dst src));
5106   format %{ "paddb   $dst,$src\t! add packedB" %}
5107   ins_encode %{
5108     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5109   %}
5110   ins_pipe( pipe_slow );
5111 %}
5112 
5113 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
5114   predicate(UseAVX > 0);
5115   match(Set dst (AddVB src1 src2));
5116   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
5117   ins_encode %{
5118     int vlen_enc = vector_length_encoding(this);
5119     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5120   %}
5121   ins_pipe( pipe_slow );
5122 %}
5123 
5124 instruct vaddB_mem(vec dst, vec src, memory mem) %{
5125   predicate((UseAVX > 0) &&
5126             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5127   match(Set dst (AddVB src (LoadVector mem)));
5128   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
5129   ins_encode %{
5130     int vlen_enc = vector_length_encoding(this);
5131     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5132   %}
5133   ins_pipe( pipe_slow );
5134 %}
5135 
5136 // Shorts/Chars vector add
5137 instruct vaddS(vec dst, vec src) %{
5138   predicate(UseAVX == 0);
5139   match(Set dst (AddVS dst src));
5140   format %{ "paddw   $dst,$src\t! add packedS" %}
5141   ins_encode %{
5142     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5143   %}
5144   ins_pipe( pipe_slow );
5145 %}
5146 
5147 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
5148   predicate(UseAVX > 0);
5149   match(Set dst (AddVS src1 src2));
5150   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
5151   ins_encode %{
5152     int vlen_enc = vector_length_encoding(this);
5153     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5154   %}
5155   ins_pipe( pipe_slow );
5156 %}
5157 
5158 instruct vaddS_mem(vec dst, vec src, memory mem) %{
5159   predicate((UseAVX > 0) &&
5160             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5161   match(Set dst (AddVS src (LoadVector mem)));
5162   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
5163   ins_encode %{
5164     int vlen_enc = vector_length_encoding(this);
5165     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5166   %}
5167   ins_pipe( pipe_slow );
5168 %}
5169 
5170 // Integers vector add
5171 instruct vaddI(vec dst, vec src) %{
5172   predicate(UseAVX == 0);
5173   match(Set dst (AddVI dst src));
5174   format %{ "paddd   $dst,$src\t! add packedI" %}
5175   ins_encode %{
5176     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
5177   %}
5178   ins_pipe( pipe_slow );
5179 %}
5180 
5181 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
5182   predicate(UseAVX > 0);
5183   match(Set dst (AddVI src1 src2));
5184   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
5185   ins_encode %{
5186     int vlen_enc = vector_length_encoding(this);
5187     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5188   %}
5189   ins_pipe( pipe_slow );
5190 %}
5191 
5192 
5193 instruct vaddI_mem(vec dst, vec src, memory mem) %{
5194   predicate((UseAVX > 0) &&
5195             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5196   match(Set dst (AddVI src (LoadVector mem)));
5197   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
5198   ins_encode %{
5199     int vlen_enc = vector_length_encoding(this);
5200     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5201   %}
5202   ins_pipe( pipe_slow );
5203 %}
5204 
5205 // Longs vector add
5206 instruct vaddL(vec dst, vec src) %{
5207   predicate(UseAVX == 0);
5208   match(Set dst (AddVL dst src));
5209   format %{ "paddq   $dst,$src\t! add packedL" %}
5210   ins_encode %{
5211     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
5212   %}
5213   ins_pipe( pipe_slow );
5214 %}
5215 
5216 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
5217   predicate(UseAVX > 0);
5218   match(Set dst (AddVL src1 src2));
5219   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
5220   ins_encode %{
5221     int vlen_enc = vector_length_encoding(this);
5222     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5223   %}
5224   ins_pipe( pipe_slow );
5225 %}
5226 
5227 instruct vaddL_mem(vec dst, vec src, memory mem) %{
5228   predicate((UseAVX > 0) &&
5229             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5230   match(Set dst (AddVL src (LoadVector mem)));
5231   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
5232   ins_encode %{
5233     int vlen_enc = vector_length_encoding(this);
5234     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5235   %}
5236   ins_pipe( pipe_slow );
5237 %}
5238 
5239 // Floats vector add
5240 instruct vaddF(vec dst, vec src) %{
5241   predicate(UseAVX == 0);
5242   match(Set dst (AddVF dst src));
5243   format %{ "addps   $dst,$src\t! add packedF" %}
5244   ins_encode %{
5245     __ addps($dst$$XMMRegister, $src$$XMMRegister);
5246   %}
5247   ins_pipe( pipe_slow );
5248 %}
5249 
5250 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
5251   predicate(UseAVX > 0);
5252   match(Set dst (AddVF src1 src2));
5253   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
5254   ins_encode %{
5255     int vlen_enc = vector_length_encoding(this);
5256     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5257   %}
5258   ins_pipe( pipe_slow );
5259 %}
5260 
5261 instruct vaddF_mem(vec dst, vec src, memory mem) %{
5262   predicate((UseAVX > 0) &&
5263             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5264   match(Set dst (AddVF src (LoadVector mem)));
5265   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
5266   ins_encode %{
5267     int vlen_enc = vector_length_encoding(this);
5268     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5269   %}
5270   ins_pipe( pipe_slow );
5271 %}
5272 
5273 // Doubles vector add
5274 instruct vaddD(vec dst, vec src) %{
5275   predicate(UseAVX == 0);
5276   match(Set dst (AddVD dst src));
5277   format %{ "addpd   $dst,$src\t! add packedD" %}
5278   ins_encode %{
5279     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
5280   %}
5281   ins_pipe( pipe_slow );
5282 %}
5283 
5284 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
5285   predicate(UseAVX > 0);
5286   match(Set dst (AddVD src1 src2));
5287   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
5288   ins_encode %{
5289     int vlen_enc = vector_length_encoding(this);
5290     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5291   %}
5292   ins_pipe( pipe_slow );
5293 %}
5294 
5295 instruct vaddD_mem(vec dst, vec src, memory mem) %{
5296   predicate((UseAVX > 0) &&
5297             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5298   match(Set dst (AddVD src (LoadVector mem)));
5299   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
5300   ins_encode %{
5301     int vlen_enc = vector_length_encoding(this);
5302     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5303   %}
5304   ins_pipe( pipe_slow );
5305 %}
5306 
5307 // --------------------------------- SUB --------------------------------------
5308 
5309 // Bytes vector sub
5310 instruct vsubB(vec dst, vec src) %{
5311   predicate(UseAVX == 0);
5312   match(Set dst (SubVB dst src));
5313   format %{ "psubb   $dst,$src\t! sub packedB" %}
5314   ins_encode %{
5315     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5316   %}
5317   ins_pipe( pipe_slow );
5318 %}
5319 
5320 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
5321   predicate(UseAVX > 0);
5322   match(Set dst (SubVB src1 src2));
5323   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
5324   ins_encode %{
5325     int vlen_enc = vector_length_encoding(this);
5326     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5327   %}
5328   ins_pipe( pipe_slow );
5329 %}
5330 
5331 instruct vsubB_mem(vec dst, vec src, memory mem) %{
5332   predicate((UseAVX > 0) &&
5333             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5334   match(Set dst (SubVB src (LoadVector mem)));
5335   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
5336   ins_encode %{
5337     int vlen_enc = vector_length_encoding(this);
5338     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5339   %}
5340   ins_pipe( pipe_slow );
5341 %}
5342 
5343 // Shorts/Chars vector sub
5344 instruct vsubS(vec dst, vec src) %{
5345   predicate(UseAVX == 0);
5346   match(Set dst (SubVS dst src));
5347   format %{ "psubw   $dst,$src\t! sub packedS" %}
5348   ins_encode %{
5349     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5350   %}
5351   ins_pipe( pipe_slow );
5352 %}
5353 
5354 
5355 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
5356   predicate(UseAVX > 0);
5357   match(Set dst (SubVS src1 src2));
5358   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
5359   ins_encode %{
5360     int vlen_enc = vector_length_encoding(this);
5361     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5362   %}
5363   ins_pipe( pipe_slow );
5364 %}
5365 
5366 instruct vsubS_mem(vec dst, vec src, memory mem) %{
5367   predicate((UseAVX > 0) &&
5368             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5369   match(Set dst (SubVS src (LoadVector mem)));
5370   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
5371   ins_encode %{
5372     int vlen_enc = vector_length_encoding(this);
5373     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5374   %}
5375   ins_pipe( pipe_slow );
5376 %}
5377 
5378 // Integers vector sub
5379 instruct vsubI(vec dst, vec src) %{
5380   predicate(UseAVX == 0);
5381   match(Set dst (SubVI dst src));
5382   format %{ "psubd   $dst,$src\t! sub packedI" %}
5383   ins_encode %{
5384     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
5385   %}
5386   ins_pipe( pipe_slow );
5387 %}
5388 
5389 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
5390   predicate(UseAVX > 0);
5391   match(Set dst (SubVI src1 src2));
5392   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
5393   ins_encode %{
5394     int vlen_enc = vector_length_encoding(this);
5395     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5396   %}
5397   ins_pipe( pipe_slow );
5398 %}
5399 
5400 instruct vsubI_mem(vec dst, vec src, memory mem) %{
5401   predicate((UseAVX > 0) &&
5402             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5403   match(Set dst (SubVI src (LoadVector mem)));
5404   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
5405   ins_encode %{
5406     int vlen_enc = vector_length_encoding(this);
5407     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5408   %}
5409   ins_pipe( pipe_slow );
5410 %}
5411 
5412 // Longs vector sub
5413 instruct vsubL(vec dst, vec src) %{
5414   predicate(UseAVX == 0);
5415   match(Set dst (SubVL dst src));
5416   format %{ "psubq   $dst,$src\t! sub packedL" %}
5417   ins_encode %{
5418     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
5419   %}
5420   ins_pipe( pipe_slow );
5421 %}
5422 
5423 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
5424   predicate(UseAVX > 0);
5425   match(Set dst (SubVL src1 src2));
5426   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
5427   ins_encode %{
5428     int vlen_enc = vector_length_encoding(this);
5429     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5430   %}
5431   ins_pipe( pipe_slow );
5432 %}
5433 
5434 
5435 instruct vsubL_mem(vec dst, vec src, memory mem) %{
5436   predicate((UseAVX > 0) &&
5437             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5438   match(Set dst (SubVL src (LoadVector mem)));
5439   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
5440   ins_encode %{
5441     int vlen_enc = vector_length_encoding(this);
5442     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5443   %}
5444   ins_pipe( pipe_slow );
5445 %}
5446 
5447 // Floats vector sub
5448 instruct vsubF(vec dst, vec src) %{
5449   predicate(UseAVX == 0);
5450   match(Set dst (SubVF dst src));
5451   format %{ "subps   $dst,$src\t! sub packedF" %}
5452   ins_encode %{
5453     __ subps($dst$$XMMRegister, $src$$XMMRegister);
5454   %}
5455   ins_pipe( pipe_slow );
5456 %}
5457 
5458 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
5459   predicate(UseAVX > 0);
5460   match(Set dst (SubVF src1 src2));
5461   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
5462   ins_encode %{
5463     int vlen_enc = vector_length_encoding(this);
5464     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5465   %}
5466   ins_pipe( pipe_slow );
5467 %}
5468 
5469 instruct vsubF_mem(vec dst, vec src, memory mem) %{
5470   predicate((UseAVX > 0) &&
5471             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5472   match(Set dst (SubVF src (LoadVector mem)));
5473   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
5474   ins_encode %{
5475     int vlen_enc = vector_length_encoding(this);
5476     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5477   %}
5478   ins_pipe( pipe_slow );
5479 %}
5480 
5481 // Doubles vector sub
5482 instruct vsubD(vec dst, vec src) %{
5483   predicate(UseAVX == 0);
5484   match(Set dst (SubVD dst src));
5485   format %{ "subpd   $dst,$src\t! sub packedD" %}
5486   ins_encode %{
5487     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
5488   %}
5489   ins_pipe( pipe_slow );
5490 %}
5491 
5492 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
5493   predicate(UseAVX > 0);
5494   match(Set dst (SubVD src1 src2));
5495   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
5496   ins_encode %{
5497     int vlen_enc = vector_length_encoding(this);
5498     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5499   %}
5500   ins_pipe( pipe_slow );
5501 %}
5502 
5503 instruct vsubD_mem(vec dst, vec src, memory mem) %{
5504   predicate((UseAVX > 0) &&
5505             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5506   match(Set dst (SubVD src (LoadVector mem)));
5507   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
5508   ins_encode %{
5509     int vlen_enc = vector_length_encoding(this);
5510     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5511   %}
5512   ins_pipe( pipe_slow );
5513 %}
5514 
5515 // --------------------------------- MUL --------------------------------------
5516 
5517 // Byte vector mul
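// x86 has no direct byte vector multiply instruction. The rules below widen
// the bytes to words (pmovsxbw/vpmovsxbw), multiply with pmullw/vpmullw, mask
// each word result down to its low byte and pack the bytes back together.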
5518 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5519   predicate(Matcher::vector_length(n) == 4 ||
5520             Matcher::vector_length(n) == 8);
5521   match(Set dst (MulVB src1 src2));
5522   effect(TEMP dst, TEMP tmp, TEMP scratch);
5523   format %{"vector_mulB $dst,$src1,$src2" %}
5524   ins_encode %{
5525     assert(UseSSE > 3, "required");
5526     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
5527     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
5528     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
5529     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5530     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5531     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5532   %}
5533   ins_pipe( pipe_slow );
5534 %}
5535 
5536 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5537   predicate(Matcher::vector_length(n) == 16 && UseAVX <= 1);
5538   match(Set dst (MulVB src1 src2));
5539   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5540   format %{"vector_mulB $dst,$src1,$src2" %}
5541   ins_encode %{
5542     assert(UseSSE > 3, "required");
5543     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
5544     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
5545     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
5546     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
5547     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
5548     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5549     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
5550     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
5551     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5552     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5553     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5554     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5555   %}
5556   ins_pipe( pipe_slow );
5557 %}
5558 
5559 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5560   predicate(Matcher::vector_length(n) == 16 && UseAVX > 1);
5561   match(Set dst (MulVB src1 src2));
5562   effect(TEMP dst, TEMP tmp, TEMP scratch);
5563   format %{"vector_mulB $dst,$src1,$src2" %}
5564   ins_encode %{
5565     int vlen_enc = Assembler::AVX_256bit;
5566     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5567     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5568     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5569     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5570     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5571     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
5572     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
5573   %}
5574   ins_pipe( pipe_slow );
5575 %}
5576 
5577 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5578   predicate(Matcher::vector_length(n) == 32);
5579   match(Set dst (MulVB src1 src2));
5580   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5581   format %{"vector_mulB $dst,$src1,$src2" %}
5582   ins_encode %{
5583     assert(UseAVX > 1, "required");
5584     int vlen_enc = Assembler::AVX_256bit;
5585     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5586     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
5587     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5588     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5589     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5590     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5591     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5592     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5593     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5594     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5595     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5596     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5597     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5598     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5599   %}
5600   ins_pipe( pipe_slow );
5601 %}
5602 
5603 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5604   predicate(Matcher::vector_length(n) == 64);
5605   match(Set dst (MulVB src1 src2));
5606   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5607   format %{"vector_mulB $dst,$src1,$src2" %}
5608   ins_encode %{
5609     assert(UseAVX > 2, "required");
5610     int vlen_enc = Assembler::AVX_512bit;
5611     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5612     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
5613     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5614     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5615     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5616     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5617     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5618     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5619     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5620     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5621     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5622     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5623     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5624     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
5625     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5626   %}
5627   ins_pipe( pipe_slow );
5628 %}
5629 
5630 // Shorts/Chars vector mul
5631 instruct vmulS(vec dst, vec src) %{
5632   predicate(UseAVX == 0);
5633   match(Set dst (MulVS dst src));
5634   format %{ "pmullw $dst,$src\t! mul packedS" %}
5635   ins_encode %{
5636     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
5637   %}
5638   ins_pipe( pipe_slow );
5639 %}
5640 
5641 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
5642   predicate(UseAVX > 0);
5643   match(Set dst (MulVS src1 src2));
5644   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
5645   ins_encode %{
5646     int vlen_enc = vector_length_encoding(this);
5647     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5648   %}
5649   ins_pipe( pipe_slow );
5650 %}
5651 
5652 instruct vmulS_mem(vec dst, vec src, memory mem) %{
5653   predicate((UseAVX > 0) &&
5654             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5655   match(Set dst (MulVS src (LoadVector mem)));
5656   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
5657   ins_encode %{
5658     int vlen_enc = vector_length_encoding(this);
5659     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5660   %}
5661   ins_pipe( pipe_slow );
5662 %}
5663 
5664 // Integers vector mul
5665 instruct vmulI(vec dst, vec src) %{
5666   predicate(UseAVX == 0);
5667   match(Set dst (MulVI dst src));
5668   format %{ "pmulld  $dst,$src\t! mul packedI" %}
5669   ins_encode %{
5670     assert(UseSSE > 3, "required");
5671     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
5672   %}
5673   ins_pipe( pipe_slow );
5674 %}
5675 
5676 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
5677   predicate(UseAVX > 0);
5678   match(Set dst (MulVI src1 src2));
5679   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
5680   ins_encode %{
5681     int vlen_enc = vector_length_encoding(this);
5682     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5683   %}
5684   ins_pipe( pipe_slow );
5685 %}
5686 
5687 instruct vmulI_mem(vec dst, vec src, memory mem) %{
5688   predicate((UseAVX > 0) &&
5689             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5690   match(Set dst (MulVI src (LoadVector mem)));
5691   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
5692   ins_encode %{
5693     int vlen_enc = vector_length_encoding(this);
5694     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5695   %}
5696   ins_pipe( pipe_slow );
5697 %}
5698 
5699 // Longs vector mul
5700 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
5701   predicate(VM_Version::supports_avx512dq());
5702   match(Set dst (MulVL src1 src2));
5703   format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
5704   ins_encode %{
5705     assert(UseAVX > 2, "required");
5706     int vlen_enc = vector_length_encoding(this);
5707     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5708   %}
5709   ins_pipe( pipe_slow );
5710 %}
5711 
5712 instruct vmulL_mem(vec dst, vec src, memory mem) %{
5713   predicate(VM_Version::supports_avx512dq() &&
5714               (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5715   match(Set dst (MulVL src (LoadVector mem)));
5716   format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
5717   ins_encode %{
5718     assert(UseAVX > 2, "required");
5719     int vlen_enc = vector_length_encoding(this);
5720     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5721   %}
5722   ins_pipe( pipe_slow );
5723 %}
5724 
5725 instruct mul2L_reg(vec dst, vec src2, legVec tmp) %{
5726   predicate(Matcher::vector_length(n) == 2 && !VM_Version::supports_avx512dq());
5727   match(Set dst (MulVL dst src2));
5728   effect(TEMP dst, TEMP tmp);
5729   format %{ "pshufd $tmp,$src2, 177\n\t"
5730             "pmulld $tmp,$dst\n\t"
5731             "phaddd $tmp,$tmp\n\t"
5732             "pmovzxdq $tmp,$tmp\n\t"
5733             "psllq $tmp, 32\n\t"
5734             "pmuludq $dst,$src2\n\t"
5735             "paddq $dst,$tmp\n\t! mul packed2L" %}
5736 
5737   ins_encode %{
5738     assert(VM_Version::supports_sse4_1(), "required");
5739     int vlen_enc = Assembler::AVX_128bit;
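    // 64x64-bit multiply built from 32-bit halves: pmuludq computes
    // lo(a)*lo(b), while the cross terms lo(a)*hi(b) + hi(a)*lo(b) are formed
    // with pmulld/phaddd, shifted into the upper 32 bits and added in.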
5740     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
5741     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
5742     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5743     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
5744     __ psllq($tmp$$XMMRegister, 32);
5745     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
5746     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
5747   %}
5748   ins_pipe( pipe_slow );
5749 %}
5750 
5751 instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, legVec tmp, legVec tmp1) %{
5752   predicate(Matcher::vector_length(n) == 4 && !VM_Version::supports_avx512dq());
5753   match(Set dst (MulVL src1 src2));
5754   effect(TEMP tmp1, TEMP tmp);
5755   format %{ "vpshufd $tmp,$src2\n\t"
5756             "vpmulld $tmp,$src1,$tmp\n\t"
5757             "vphaddd $tmp,$tmp,$tmp\n\t"
5758             "vpmovzxdq $tmp,$tmp\n\t"
5759             "vpsllq $tmp,$tmp\n\t"
5760             "vpmuludq $tmp1,$src1,$src2\n\t"
5761             "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
5762   ins_encode %{
5763     int vlen_enc = Assembler::AVX_256bit;
5764     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
5765     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5766     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
5767     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5768     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5769     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
5770     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5771     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5772   %}
5773   ins_pipe( pipe_slow );
5774 %}
5775 
5776 // Floats vector mul
5777 instruct vmulF(vec dst, vec src) %{
5778   predicate(UseAVX == 0);
5779   match(Set dst (MulVF dst src));
5780   format %{ "mulps   $dst,$src\t! mul packedF" %}
5781   ins_encode %{
5782     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
5783   %}
5784   ins_pipe( pipe_slow );
5785 %}
5786 
5787 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
5788   predicate(UseAVX > 0);
5789   match(Set dst (MulVF src1 src2));
5790   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
5791   ins_encode %{
5792     int vlen_enc = vector_length_encoding(this);
5793     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5794   %}
5795   ins_pipe( pipe_slow );
5796 %}
5797 
5798 instruct vmulF_mem(vec dst, vec src, memory mem) %{
5799   predicate((UseAVX > 0) &&
5800             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5801   match(Set dst (MulVF src (LoadVector mem)));
5802   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
5803   ins_encode %{
5804     int vlen_enc = vector_length_encoding(this);
5805     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5806   %}
5807   ins_pipe( pipe_slow );
5808 %}
5809 
5810 // Doubles vector mul
5811 instruct vmulD(vec dst, vec src) %{
5812   predicate(UseAVX == 0);
5813   match(Set dst (MulVD dst src));
5814   format %{ "mulpd   $dst,$src\t! mul packedD" %}
5815   ins_encode %{
5816     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
5817   %}
5818   ins_pipe( pipe_slow );
5819 %}
5820 
5821 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
5822   predicate(UseAVX > 0);
5823   match(Set dst (MulVD src1 src2));
5824   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
5825   ins_encode %{
5826     int vlen_enc = vector_length_encoding(this);
5827     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5828   %}
5829   ins_pipe( pipe_slow );
5830 %}
5831 
5832 instruct vmulD_mem(vec dst, vec src, memory mem) %{
5833   predicate((UseAVX > 0) &&
5834             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5835   match(Set dst (MulVD src (LoadVector mem)));
5836   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
5837   ins_encode %{
5838     int vlen_enc = vector_length_encoding(this);
5839     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5840   %}
5841   ins_pipe( pipe_slow );
5842 %}
5843 
5844 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5845   predicate(Matcher::vector_length(n) == 8);
5846   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
5847   effect(TEMP dst, USE src1, USE src2);
5848   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
5849             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
5850          %}
5851   ins_encode %{
5852     assert(UseAVX > 0, "required");
5853 
5854     int vlen_enc = Assembler::AVX_256bit;
5855     int cond = (Assembler::Condition)($copnd$$cmpcode);
5856     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5857     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5858   %}
5859   ins_pipe( pipe_slow );
5860 %}
5861 
5862 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5863   predicate(Matcher::vector_length(n) == 4);
5864   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
5865   effect(TEMP dst, USE src1, USE src2);
5866   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
5867             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
5868          %}
5869   ins_encode %{
5870     assert(UseAVX > 0, "required");
5871 
5872     int vlen_enc = Assembler::AVX_256bit;
5873     int cond = (Assembler::Condition)($copnd$$cmpcode);
5874     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5875     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5876   %}
5877   ins_pipe( pipe_slow );
5878 %}
5879 
5880 // --------------------------------- DIV --------------------------------------
5881 
5882 // Floats vector div
5883 instruct vdivF(vec dst, vec src) %{
5884   predicate(UseAVX == 0);
5885   match(Set dst (DivVF dst src));
5886   format %{ "divps   $dst,$src\t! div packedF" %}
5887   ins_encode %{
5888     __ divps($dst$$XMMRegister, $src$$XMMRegister);
5889   %}
5890   ins_pipe( pipe_slow );
5891 %}
5892 
5893 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
5894   predicate(UseAVX > 0);
5895   match(Set dst (DivVF src1 src2));
5896   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
5897   ins_encode %{
5898     int vlen_enc = vector_length_encoding(this);
5899     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5900   %}
5901   ins_pipe( pipe_slow );
5902 %}
5903 
5904 instruct vdivF_mem(vec dst, vec src, memory mem) %{
5905   predicate((UseAVX > 0) &&
5906             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5907   match(Set dst (DivVF src (LoadVector mem)));
5908   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
5909   ins_encode %{
5910     int vlen_enc = vector_length_encoding(this);
5911     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5912   %}
5913   ins_pipe( pipe_slow );
5914 %}
5915 
5916 // Doubles vector div
5917 instruct vdivD(vec dst, vec src) %{
5918   predicate(UseAVX == 0);
5919   match(Set dst (DivVD dst src));
5920   format %{ "divpd   $dst,$src\t! div packedD" %}
5921   ins_encode %{
5922     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
5923   %}
5924   ins_pipe( pipe_slow );
5925 %}
5926 
5927 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
5928   predicate(UseAVX > 0);
5929   match(Set dst (DivVD src1 src2));
5930   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
5931   ins_encode %{
5932     int vlen_enc = vector_length_encoding(this);
5933     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5934   %}
5935   ins_pipe( pipe_slow );
5936 %}
5937 
5938 instruct vdivD_mem(vec dst, vec src, memory mem) %{
5939   predicate((UseAVX > 0) &&
5940             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5941   match(Set dst (DivVD src (LoadVector mem)));
5942   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
5943   ins_encode %{
5944     int vlen_enc = vector_length_encoding(this);
5945     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5946   %}
5947   ins_pipe( pipe_slow );
5948 %}
5949 
5950 // ------------------------------ MinMax ---------------------------------------
5951 
5952 // Byte, Short, Int vector Min/Max
5953 instruct minmax_reg_sse(vec dst, vec src) %{
5954   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5955             UseAVX == 0);
5956   match(Set dst (MinV dst src));
5957   match(Set dst (MaxV dst src));
5958   format %{ "vector_minmax  $dst,$src\t!  " %}
5959   ins_encode %{
5960     assert(UseSSE >= 4, "required");
5961 
5962     int opcode = this->ideal_Opcode();
5963     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5964     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
5965   %}
5966   ins_pipe( pipe_slow );
5967 %}
5968 
5969 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
5970   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5971             UseAVX > 0);
5972   match(Set dst (MinV src1 src2));
5973   match(Set dst (MaxV src1 src2));
5974   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
5975   ins_encode %{
5976     int opcode = this->ideal_Opcode();
5977     int vlen_enc = vector_length_encoding(this);
5978     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5979 
5980     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5981   %}
5982   ins_pipe( pipe_slow );
5983 %}
5984 
5985 // Long vector Min/Max
5986 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
5987   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
5988             UseAVX == 0);
5989   match(Set dst (MinV dst src));
5990   match(Set dst (MaxV src dst));
5991   effect(TEMP dst, TEMP tmp);
5992   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
5993   ins_encode %{
5994     assert(UseSSE >= 4, "required");
5995 
5996     int opcode = this->ideal_Opcode();
5997     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5998     assert(elem_bt == T_LONG, "sanity");
5999 
6000     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
6001   %}
6002   ins_pipe( pipe_slow );
6003 %}
6004 
6005 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
6006   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
6007             UseAVX > 0 && !VM_Version::supports_avx512vl());
6008   match(Set dst (MinV src1 src2));
6009   match(Set dst (MaxV src1 src2));
6010   effect(TEMP dst);
6011   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
6012   ins_encode %{
6013     int vlen_enc = vector_length_encoding(this);
6014     int opcode = this->ideal_Opcode();
6015     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6016     assert(elem_bt == T_LONG, "sanity");
6017 
6018     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6019   %}
6020   ins_pipe( pipe_slow );
6021 %}
6022 
6023 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
6024   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
6025             Matcher::vector_element_basic_type(n) == T_LONG);
6026   match(Set dst (MinV src1 src2));
6027   match(Set dst (MaxV src1 src2));
6028   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
6029   ins_encode %{
6030     assert(UseAVX > 2, "required");
6031 
6032     int vlen_enc = vector_length_encoding(this);
6033     int opcode = this->ideal_Opcode();
6034     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6035     assert(elem_bt == T_LONG, "sanity");
6036 
6037     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6038   %}
6039   ins_pipe( pipe_slow );
6040 %}
6041 
6042 // Float/Double vector Min/Max
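// Java's Math.min/max semantics differ from raw vminps/vminpd/vmaxps/vmaxpd:
// a NaN input must produce NaN, and min(-0.0, +0.0) is -0.0 (max is +0.0),
// whereas the raw instructions simply return the second source in those
// cases. The vminmax_fp/evminmax_fp helpers below use compares and blends
// (masked moves on EVEX) to get the Java behaviour.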
6043 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
6044   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
6045             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
6046             UseAVX > 0);
6047   match(Set dst (MinV a b));
6048   match(Set dst (MaxV a b));
6049   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
6050   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
6051   ins_encode %{
6052     assert(UseAVX > 0, "required");
6053 
6054     int opcode = this->ideal_Opcode();
6055     int vlen_enc = vector_length_encoding(this);
6056     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6057 
6058     __ vminmax_fp(opcode, elem_bt,
6059                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
6060                   $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
6061   %}
6062   ins_pipe( pipe_slow );
6063 %}
6064 
6065 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
6066   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
6067             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
6068   match(Set dst (MinV a b));
6069   match(Set dst (MaxV a b));
6070   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
6071   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
6072   ins_encode %{
6073     assert(UseAVX > 2, "required");
6074 
6075     int opcode = this->ideal_Opcode();
6076     int vlen_enc = vector_length_encoding(this);
6077     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6078 
6079     __ evminmax_fp(opcode, elem_bt,
6080                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
6081                    $ktmp$$KRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
6082   %}
6083   ins_pipe( pipe_slow );
6084 %}
6085 
6086 // --------------------------------- Signum/CopySign ---------------------------
6087 
6088 instruct signumF_reg(regF dst, regF zero, regF one, rRegP scratch, rFlagsReg cr) %{
6089   match(Set dst (SignumF dst (Binary zero one)));
6090   effect(TEMP scratch, KILL cr);
6091   format %{ "signumF $dst, $dst\t! using $scratch as TEMP" %}
6092   ins_encode %{
6093     int opcode = this->ideal_Opcode();
6094     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
6095   %}
6096   ins_pipe( pipe_slow );
6097 %}
6098 
6099 instruct signumD_reg(regD dst, regD zero, regD one, rRegP scratch, rFlagsReg cr) %{
6100   match(Set dst (SignumD dst (Binary zero one)));
6101   effect(TEMP scratch, KILL cr);
6102   format %{ "signumD $dst, $dst\t! using $scratch as TEMP" %}
6103   ins_encode %{
6104     int opcode = this->ideal_Opcode();
6105     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
6106   %}
6107   ins_pipe( pipe_slow );
6108 %}
6109 
6110 // ---------------------------------------
6111 // For copySign use 0xE4 as the truth-table immediate for vpternlog
6112 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
6113 // C (xmm2) is set to 0x7FFFFFFF
6114 // Wherever xmm2 is 0, we want to pick from B (sign)
6115 // Wherever xmm2 is 1, we want to pick from A (src)
6116 //
6117 // A B C Result
6118 // 0 0 0 0
6119 // 0 0 1 0
6120 // 0 1 0 1
6121 // 0 1 1 0
6122 // 1 0 0 0
6123 // 1 0 1 1
6124 // 1 1 0 1
6125 // 1 1 1 1
6126 //
6127 // Result, read from the A=1,B=1,C=1 row down to the A=0,B=0,C=0 row, is binary 11100100 = 0xE4
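//
// As an illustrative check (not part of the build), the immediate can be
// recomputed from the selection rule "pick the A bit where C is 1, else B":
//
//   unsigned imm = 0;
//   for (int i = 0; i < 8; i++) {   // row index: bit 2 = A, bit 1 = B, bit 0 = C
//     bool a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1;
//     if (c ? a : b) imm |= 1u << i;
//   }
//   // imm == 0xE4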
6128 // ---------------------------------------
6129 
6130 #ifdef _LP64
6131 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
6132   match(Set dst (CopySignF dst src));
6133   effect(TEMP tmp1, TEMP tmp2);
6134   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
6135   ins_encode %{
6136     __ movl($tmp2$$Register, 0x7FFFFFFF);
6137     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
6138     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
6139   %}
6140   ins_pipe( pipe_slow );
6141 %}
6142 
6143 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
6144   match(Set dst (CopySignD dst (Binary src zero)));
6145   ins_cost(100);
6146   effect(TEMP tmp1, TEMP tmp2);
6147   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
6148   ins_encode %{
6149     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
6150     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
6151     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
6152   %}
6153   ins_pipe( pipe_slow );
6154 %}
6155 #endif // _LP64
6156 
6157 // --------------------------------- Sqrt --------------------------------------
6158 
6159 instruct vsqrtF_reg(vec dst, vec src) %{
6160   match(Set dst (SqrtVF src));
6161   ins_cost(400);
6162   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
6163   ins_encode %{
6164     assert(UseAVX > 0, "required");
6165     int vlen_enc = vector_length_encoding(this);
6166     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6167   %}
6168   ins_pipe( pipe_slow );
6169 %}
6170 
6171 instruct vsqrtF_mem(vec dst, memory mem) %{
6172   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
6173   match(Set dst (SqrtVF (LoadVector mem)));
6174   ins_cost(400);
6175   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
6176   ins_encode %{
6177     assert(UseAVX > 0, "required");
6178     int vlen_enc = vector_length_encoding(this);
6179     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
6180   %}
6181   ins_pipe( pipe_slow );
6182 %}
6183 
6184 // Doubles vector sqrt
6185 instruct vsqrtD_reg(vec dst, vec src) %{
6186   match(Set dst (SqrtVD src));
6187   ins_cost(400);
6188   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
6189   ins_encode %{
6190     assert(UseAVX > 0, "required");
6191     int vlen_enc = vector_length_encoding(this);
6192     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6193   %}
6194   ins_pipe( pipe_slow );
6195 %}
6196 
6197 instruct vsqrtD_mem(vec dst, memory mem) %{
6198   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
6199   match(Set dst (SqrtVD (LoadVector mem)));
6200   ins_cost(400);
6201   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
6202   ins_encode %{
6203     assert(UseAVX > 0, "required");
6204     int vlen_enc = vector_length_encoding(this);
6205     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
6206   %}
6207   ins_pipe( pipe_slow );
6208 %}
6209 
6210 // ------------------------------ Shift ---------------------------------------
6211 
6212 // Left and right shift count vectors are the same on x86
6213 // (only lowest bits of xmm reg are used for count).
6214 instruct vshiftcnt(vec dst, rRegI cnt) %{
6215   match(Set dst (LShiftCntV cnt));
6216   match(Set dst (RShiftCntV cnt));
6217   format %{ "movdl    $dst,$cnt\t! load shift count" %}
6218   ins_encode %{
6219     __ movdl($dst$$XMMRegister, $cnt$$Register);
6220   %}
6221   ins_pipe( pipe_slow );
6222 %}
6223 
6224 // Byte vector shift
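// x86 has no byte vector shift instruction. The byte shift rules below widen
// the bytes to words, shift the words, mask each result down to its low byte
// and pack back to bytes.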
6225 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6226   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
6227   match(Set dst ( LShiftVB src shift));
6228   match(Set dst ( RShiftVB src shift));
6229   match(Set dst (URShiftVB src shift));
6230   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
6231   format %{"vector_byte_shift $dst,$src,$shift" %}
6232   ins_encode %{
6233     assert(UseSSE > 3, "required");
6234     int opcode = this->ideal_Opcode();
6235     bool sign = (opcode != Op_URShiftVB);
6236     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
6237     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
6238     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6239     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
6240     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
6241   %}
6242   ins_pipe( pipe_slow );
6243 %}
6244 
6245 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6246   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
6247             UseAVX <= 1);
6248   match(Set dst ( LShiftVB src shift));
6249   match(Set dst ( RShiftVB src shift));
6250   match(Set dst (URShiftVB src shift));
6251   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
6252   format %{"vector_byte_shift $dst,$src,$shift" %}
6253   ins_encode %{
6254     assert(UseSSE > 3, "required");
6255     int opcode = this->ideal_Opcode();
6256     bool sign = (opcode != Op_URShiftVB);
6257     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
6258     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
6259     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
6260     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
6261     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
6262     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6263     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
6264     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
6265     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
6266   %}
6267   ins_pipe( pipe_slow );
6268 %}
6269 
6270 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6271   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
6272             UseAVX > 1);
6273   match(Set dst ( LShiftVB src shift));
6274   match(Set dst ( RShiftVB src shift));
6275   match(Set dst (URShiftVB src shift));
6276   effect(TEMP dst, TEMP tmp, TEMP scratch);
6277   format %{"vector_byte_shift $dst,$src,$shift" %}
6278   ins_encode %{
6279     int opcode = this->ideal_Opcode();
6280     bool sign = (opcode != Op_URShiftVB);
6281     int vlen_enc = Assembler::AVX_256bit;
6282     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
6283     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6284     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6285     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
6286     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
6287   %}
6288   ins_pipe( pipe_slow );
6289 %}
6290 
6291 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6292   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
6293   match(Set dst ( LShiftVB src shift));
6294   match(Set dst ( RShiftVB src shift));
6295   match(Set dst (URShiftVB src shift));
6296   effect(TEMP dst, TEMP tmp, TEMP scratch);
6297   format %{"vector_byte_shift $dst,$src,$shift" %}
6298   ins_encode %{
6299     assert(UseAVX > 1, "required");
6300     int opcode = this->ideal_Opcode();
6301     bool sign = (opcode != Op_URShiftVB);
6302     int vlen_enc = Assembler::AVX_256bit;
6303     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
6304     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6305     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6306     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6307     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6308     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6309     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6310     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6311     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6312   %}
6313   ins_pipe( pipe_slow );
6314 %}
6315 
6316 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6317   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
6318   match(Set dst ( LShiftVB src shift));
6319   match(Set dst  (RShiftVB src shift));
6320   match(Set dst (URShiftVB src shift));
6321   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6322   format %{"vector_byte_shift $dst,$src,$shift" %}
6323   ins_encode %{
6324     assert(UseAVX > 2, "required");
6325     int opcode = this->ideal_Opcode();
6326     bool sign = (opcode != Op_URShiftVB);
6327     int vlen_enc = Assembler::AVX_512bit;
6328     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6329     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
6330     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6331     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6332     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6333     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6334     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6335     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6336     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6337     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
6338     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
6339     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6340   %}
6341   ins_pipe( pipe_slow );
6342 %}
6343 
6344 // A logical right shift of a short vector would produce an incorrect Java
6345 // result for negative data, because Java converts short values to int with
6346 // sign extension before shifting. Char vectors are fine, since chars are
6347 // unsigned values.
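// For example (scalar Java semantics, shown for illustration only): with
// short s = -1, the expression (short)(s >>> 3) first sign-extends s to the
// int 0xFFFFFFFF, shifts it to 0x1FFFFFFF and then narrows back to -1
// (0xFFFF), whereas a plain 16-bit logical shift of 0xFFFF by 3 would give
// 0x1FFF (8191).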
6348 // Shorts/Chars vector shift
6349 instruct vshiftS(vec dst, vec src, vec shift) %{
6350   predicate(!n->as_ShiftV()->is_var_shift());
6351   match(Set dst ( LShiftVS src shift));
6352   match(Set dst ( RShiftVS src shift));
6353   match(Set dst (URShiftVS src shift));
6354   effect(TEMP dst, USE src, USE shift);
6355   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
6356   ins_encode %{
6357     int opcode = this->ideal_Opcode();
6358     if (UseAVX > 0) {
6359       int vlen_enc = vector_length_encoding(this);
6360       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6361     } else {
6362       int vlen = Matcher::vector_length(this);
6363       if (vlen == 2) {
6364         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6365         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6366       } else if (vlen == 4) {
6367         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6368         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6369       } else {
6370         assert (vlen == 8, "sanity");
6371         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6372         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6373       }
6374     }
6375   %}
6376   ins_pipe( pipe_slow );
6377 %}
6378 
6379 // Integers vector shift
6380 instruct vshiftI(vec dst, vec src, vec shift) %{
6381   predicate(!n->as_ShiftV()->is_var_shift());
6382   match(Set dst ( LShiftVI src shift));
6383   match(Set dst ( RShiftVI src shift));
6384   match(Set dst (URShiftVI src shift));
6385   effect(TEMP dst, USE src, USE shift);
6386   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
6387   ins_encode %{
6388     int opcode = this->ideal_Opcode();
6389     if (UseAVX > 0) {
6390       int vlen_enc = vector_length_encoding(this);
6391       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6392     } else {
6393       int vlen = Matcher::vector_length(this);
6394       if (vlen == 2) {
6395         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6396         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6397       } else {
6398         assert(vlen == 4, "sanity");
6399         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6400         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6401       }
6402     }
6403   %}
6404   ins_pipe( pipe_slow );
6405 %}
6406 
6407 // Integers vector constant shift
6408 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
6409   match(Set dst (LShiftVI src (LShiftCntV shift)));
6410   match(Set dst (RShiftVI src (RShiftCntV shift)));
6411   match(Set dst (URShiftVI src (RShiftCntV shift)));
6412   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
6413   ins_encode %{
6414     int opcode = this->ideal_Opcode();
6415     if (UseAVX > 0) {
6416       int vector_len = vector_length_encoding(this);
6417       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6418     } else {
6419       int vlen = Matcher::vector_length(this);
6420       if (vlen == 2) {
6421         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6422         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6423       } else {
6424         assert(vlen == 4, "sanity");
6425         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6426         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6427       }
6428     }
6429   %}
6430   ins_pipe( pipe_slow );
6431 %}
6432 
6433 // Longs vector shift
6434 instruct vshiftL(vec dst, vec src, vec shift) %{
6435   predicate(!n->as_ShiftV()->is_var_shift());
6436   match(Set dst ( LShiftVL src shift));
6437   match(Set dst (URShiftVL src shift));
6438   effect(TEMP dst, USE src, USE shift);
6439   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
6440   ins_encode %{
6441     int opcode = this->ideal_Opcode();
6442     if (UseAVX > 0) {
6443       int vlen_enc = vector_length_encoding(this);
6444       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6445     } else {
6446       assert(Matcher::vector_length(this) == 2, "");
6447       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6448       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6449     }
6450   %}
6451   ins_pipe( pipe_slow );
6452 %}
6453 
6454 // Longs vector constant shift
6455 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
6456   match(Set dst (LShiftVL src (LShiftCntV shift)));
6457   match(Set dst (URShiftVL src (RShiftCntV shift)));
6458   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
6459   ins_encode %{
6460     int opcode = this->ideal_Opcode();
6461     if (UseAVX > 0) {
6462       int vector_len = vector_length_encoding(this);
6463       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6464     } else {
6465       assert(Matcher::vector_length(this) == 2, "");
6466       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6467       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6468     }
6469   %}
6470   ins_pipe( pipe_slow );
6471 %}
6472 
6473 // -------------------ArithmeticRightShift -----------------------------------
6474 // Long vector arithmetic right shift
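// x86 has no packed 64-bit arithmetic right shift before AVX-512 (evpsraq),
// so the SSE/AVX2 rule below emulates it: shift both the value and the long
// sign-bit mask logically by the same count, then sign-extend the result via
// (x ^ m) - m.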
6475 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6476   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
6477   match(Set dst (RShiftVL src shift));
6478   effect(TEMP dst, TEMP tmp, TEMP scratch);
6479   format %{ "vshiftq $dst,$src,$shift" %}
6480   ins_encode %{
6481     uint vlen = Matcher::vector_length(this);
6482     if (vlen == 2) {
6483       assert(UseSSE >= 2, "required");
6484       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6485       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6486       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6487       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6488       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6489       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6490     } else {
6491       assert(vlen == 4, "sanity");
6492       assert(UseAVX > 1, "required");
6493       int vlen_enc = Assembler::AVX_256bit;
6494       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6495       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6496       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6497       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6498       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6499     }
6500   %}
6501   ins_pipe( pipe_slow );
6502 %}
6503 
6504 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6505   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
6506   match(Set dst (RShiftVL src shift));
6507   format %{ "vshiftq $dst,$src,$shift" %}
6508   ins_encode %{
6509     int vlen_enc = vector_length_encoding(this);
6510     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6511   %}
6512   ins_pipe( pipe_slow );
6513 %}
6514 
6515 // ------------------- Variable Shift -----------------------------
6516 // Byte variable shift
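// x86 has no byte-granularity variable shift instruction, so bytes are widened
// to words, shifted with the word-sized variable-shift helper (varshiftbw), and
// the word results are packed back down to bytes with vpackuswb.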
6517 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6518   predicate(Matcher::vector_length(n) <= 8 &&
6519             n->as_ShiftV()->is_var_shift() &&
6520             !VM_Version::supports_avx512bw());
6521   match(Set dst ( LShiftVB src shift));
6522   match(Set dst ( RShiftVB src shift));
6523   match(Set dst (URShiftVB src shift));
6524   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6525   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6526   ins_encode %{
6527     assert(UseAVX >= 2, "required");
6528 
6529     int opcode = this->ideal_Opcode();
6530     int vlen_enc = Assembler::AVX_128bit;
6531     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6532     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6533   %}
6534   ins_pipe( pipe_slow );
6535 %}
6536 
6537 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6538   predicate(Matcher::vector_length(n) == 16 &&
6539             n->as_ShiftV()->is_var_shift() &&
6540             !VM_Version::supports_avx512bw());
6541   match(Set dst ( LShiftVB src shift));
6542   match(Set dst ( RShiftVB src shift));
6543   match(Set dst (URShiftVB src shift));
6544   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6545   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6546   ins_encode %{
6547     assert(UseAVX >= 2, "required");
6548 
6549     int opcode = this->ideal_Opcode();
6550     int vlen_enc = Assembler::AVX_128bit;
6551     // Shift lower half and get word result in dst
6552     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6553 
6554     // Shift upper half and get word result in vtmp1
6555     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6556     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6557     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6558 
6559     // Merge and down convert the two word results to byte in dst
6560     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6561   %}
6562   ins_pipe( pipe_slow );
6563 %}
6564 
6565 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
6566   predicate(Matcher::vector_length(n) == 32 &&
6567             n->as_ShiftV()->is_var_shift() &&
6568             !VM_Version::supports_avx512bw());
6569   match(Set dst ( LShiftVB src shift));
6570   match(Set dst ( RShiftVB src shift));
6571   match(Set dst (URShiftVB src shift));
6572   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
6573   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
6574   ins_encode %{
6575     assert(UseAVX >= 2, "required");
6576 
6577     int opcode = this->ideal_Opcode();
6578     int vlen_enc = Assembler::AVX_128bit;
6579     // Process lower 128 bits and get result in dst
6580     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6581     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6582     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6583     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6584     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6585 
6586     // Process higher 128 bits and get result in vtmp3
6587     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6588     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6589     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
6590     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
6591     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
6592     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6593     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
6594 
6595     // Merge the two results in dst
6596     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6597   %}
6598   ins_pipe( pipe_slow );
6599 %}
6600 
6601 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6602   predicate(Matcher::vector_length(n) <= 32 &&
6603             n->as_ShiftV()->is_var_shift() &&
6604             VM_Version::supports_avx512bw());
6605   match(Set dst ( LShiftVB src shift));
6606   match(Set dst ( RShiftVB src shift));
6607   match(Set dst (URShiftVB src shift));
6608   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6609   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6610   ins_encode %{
6611     assert(UseAVX > 2, "required");
6612 
6613     int opcode = this->ideal_Opcode();
6614     int vlen_enc = vector_length_encoding(this);
6615     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6616   %}
6617   ins_pipe( pipe_slow );
6618 %}
6619 
6620 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6621   predicate(Matcher::vector_length(n) == 64 &&
6622             n->as_ShiftV()->is_var_shift() &&
6623             VM_Version::supports_avx512bw());
6624   match(Set dst ( LShiftVB src shift));
6625   match(Set dst ( RShiftVB src shift));
6626   match(Set dst (URShiftVB src shift));
6627   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6628   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6629   ins_encode %{
6630     assert(UseAVX > 2, "required");
6631 
6632     int opcode = this->ideal_Opcode();
6633     int vlen_enc = Assembler::AVX_256bit;
6634     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6635     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6636     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6637     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6638     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6639   %}
6640   ins_pipe( pipe_slow );
6641 %}
6642 
6643 // Short variable shift
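// Without AVX512BW there is no variable word shift, so shorts are extended to
// ints, shifted with the variable dword shift (varshiftd), masked back to
// 16 bits and packed down with vpackusdw. With AVX512BW the evex rule below
// uses the variable word shift directly.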
6644 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6645   predicate(Matcher::vector_length(n) <= 8 &&
6646             n->as_ShiftV()->is_var_shift() &&
6647             !VM_Version::supports_avx512bw());
6648   match(Set dst ( LShiftVS src shift));
6649   match(Set dst ( RShiftVS src shift));
6650   match(Set dst (URShiftVS src shift));
6651   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6652   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6653   ins_encode %{
6654     assert(UseAVX >= 2, "required");
6655 
6656     int opcode = this->ideal_Opcode();
6657     bool sign = (opcode != Op_URShiftVS);
6658     int vlen_enc = Assembler::AVX_256bit;
6659     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
6660     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
6661     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
6662     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6663     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
6664     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6665   %}
6666   ins_pipe( pipe_slow );
6667 %}
6668 
6669 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6670   predicate(Matcher::vector_length(n) == 16 &&
6671             n->as_ShiftV()->is_var_shift() &&
6672             !VM_Version::supports_avx512bw());
6673   match(Set dst ( LShiftVS src shift));
6674   match(Set dst ( RShiftVS src shift));
6675   match(Set dst (URShiftVS src shift));
6676   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6677   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
6678   ins_encode %{
6679     assert(UseAVX >= 2, "required");
6680 
6681     int opcode = this->ideal_Opcode();
6682     bool sign = (opcode != Op_URShiftVS);
6683     int vlen_enc = Assembler::AVX_256bit;
6684     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
6685     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6686     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6687     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6688     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6689 
6690     // Shift upper half, with result in dst using vtmp1 as TEMP
6691     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
6692     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
6693     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6694     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6695     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6696     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6697 
6698     // Merge lower and upper half result into dst
6699     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6700     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6701   %}
6702   ins_pipe( pipe_slow );
6703 %}
6704 
6705 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
6706   predicate(n->as_ShiftV()->is_var_shift() &&
6707             VM_Version::supports_avx512bw());
6708   match(Set dst ( LShiftVS src shift));
6709   match(Set dst ( RShiftVS src shift));
6710   match(Set dst (URShiftVS src shift));
6711   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
6712   ins_encode %{
6713     assert(UseAVX > 2, "required");
6714 
6715     int opcode = this->ideal_Opcode();
6716     int vlen_enc = vector_length_encoding(this);
6717     if (!VM_Version::supports_avx512vl()) {
6718       vlen_enc = Assembler::AVX_512bit;
6719     }
6720     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6721   %}
6722   ins_pipe( pipe_slow );
6723 %}
6724 
6725 // Integer variable shift
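// Ints map directly onto the AVX2 variable dword shifts (vpsllvd/vpsrlvd/vpsravd)
// via varshiftd; no widening or packing is needed.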
6726 instruct vshiftI_var(vec dst, vec src, vec shift) %{
6727   predicate(n->as_ShiftV()->is_var_shift());
6728   match(Set dst ( LShiftVI src shift));
6729   match(Set dst ( RShiftVI src shift));
6730   match(Set dst (URShiftVI src shift));
6731   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
6732   ins_encode %{
6733     assert(UseAVX >= 2, "required");
6734 
6735     int opcode = this->ideal_Opcode();
6736     int vlen_enc = vector_length_encoding(this);
6737     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6738   %}
6739   ins_pipe( pipe_slow );
6740 %}
6741 
6742 // Long variable shift
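// Left and logical right variable shifts only; the arithmetic variant is
// matched by the vshiftL_arith_var rules below.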
6743 instruct vshiftL_var(vec dst, vec src, vec shift) %{
6744   predicate(n->as_ShiftV()->is_var_shift());
6745   match(Set dst ( LShiftVL src shift));
6746   match(Set dst (URShiftVL src shift));
6747   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6748   ins_encode %{
6749     assert(UseAVX >= 2, "required");
6750 
6751     int opcode = this->ideal_Opcode();
6752     int vlen_enc = vector_length_encoding(this);
6753     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6754   %}
6755   ins_pipe( pipe_slow );
6756 %}
6757 
6758 // Long variable arithmetic right shift
6759 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
6760   predicate(Matcher::vector_length(n) <= 4 &&
6761             n->as_ShiftV()->is_var_shift() &&
6762             UseAVX == 2);
6763   match(Set dst (RShiftVL src shift));
6764   effect(TEMP dst, TEMP vtmp);
6765   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
6766   ins_encode %{
6767     int opcode = this->ideal_Opcode();
6768     int vlen_enc = vector_length_encoding(this);
6769     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
6770                  $vtmp$$XMMRegister);
6771   %}
6772   ins_pipe( pipe_slow );
6773 %}
6774 
6775 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
6776   predicate(n->as_ShiftV()->is_var_shift() &&
6777             UseAVX > 2);
6778   match(Set dst (RShiftVL src shift));
6779   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6780   ins_encode %{
6781     int opcode = this->ideal_Opcode();
6782     int vlen_enc = vector_length_encoding(this);
6783     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6784   %}
6785   ins_pipe( pipe_slow );
6786 %}
6787 
6788 // --------------------------------- AND --------------------------------------
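// The SSE rules below use destructive two-operand instructions (dst op= src),
// so they match the pattern with dst as the first input; the AVX rules use the
// non-destructive three-operand forms. The same split applies to the OR and
// XOR rules that follow.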
6789 
6790 instruct vand(vec dst, vec src) %{
6791   predicate(UseAVX == 0);
6792   match(Set dst (AndV dst src));
6793   format %{ "pand    $dst,$src\t! and vectors" %}
6794   ins_encode %{
6795     __ pand($dst$$XMMRegister, $src$$XMMRegister);
6796   %}
6797   ins_pipe( pipe_slow );
6798 %}
6799 
6800 instruct vand_reg(vec dst, vec src1, vec src2) %{
6801   predicate(UseAVX > 0);
6802   match(Set dst (AndV src1 src2));
6803   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
6804   ins_encode %{
6805     int vlen_enc = vector_length_encoding(this);
6806     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6807   %}
6808   ins_pipe( pipe_slow );
6809 %}
6810 
6811 instruct vand_mem(vec dst, vec src, memory mem) %{
6812   predicate((UseAVX > 0) &&
6813             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6814   match(Set dst (AndV src (LoadVector mem)));
6815   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
6816   ins_encode %{
6817     int vlen_enc = vector_length_encoding(this);
6818     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6819   %}
6820   ins_pipe( pipe_slow );
6821 %}
6822 
6823 // --------------------------------- OR ---------------------------------------
6824 
6825 instruct vor(vec dst, vec src) %{
6826   predicate(UseAVX == 0);
6827   match(Set dst (OrV dst src));
6828   format %{ "por     $dst,$src\t! or vectors" %}
6829   ins_encode %{
6830     __ por($dst$$XMMRegister, $src$$XMMRegister);
6831   %}
6832   ins_pipe( pipe_slow );
6833 %}
6834 
6835 instruct vor_reg(vec dst, vec src1, vec src2) %{
6836   predicate(UseAVX > 0);
6837   match(Set dst (OrV src1 src2));
6838   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
6839   ins_encode %{
6840     int vlen_enc = vector_length_encoding(this);
6841     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6842   %}
6843   ins_pipe( pipe_slow );
6844 %}
6845 
6846 instruct vor_mem(vec dst, vec src, memory mem) %{
6847   predicate((UseAVX > 0) &&
6848             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6849   match(Set dst (OrV src (LoadVector mem)));
6850   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
6851   ins_encode %{
6852     int vlen_enc = vector_length_encoding(this);
6853     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6854   %}
6855   ins_pipe( pipe_slow );
6856 %}
6857 
6858 // --------------------------------- XOR --------------------------------------
6859 
6860 instruct vxor(vec dst, vec src) %{
6861   predicate(UseAVX == 0);
6862   match(Set dst (XorV dst src));
6863   format %{ "pxor    $dst,$src\t! xor vectors" %}
6864   ins_encode %{
6865     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
6866   %}
6867   ins_pipe( pipe_slow );
6868 %}
6869 
6870 instruct vxor_reg(vec dst, vec src1, vec src2) %{
6871   predicate(UseAVX > 0);
6872   match(Set dst (XorV src1 src2));
6873   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
6874   ins_encode %{
6875     int vlen_enc = vector_length_encoding(this);
6876     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6877   %}
6878   ins_pipe( pipe_slow );
6879 %}
6880 
6881 instruct vxor_mem(vec dst, vec src, memory mem) %{
6882   predicate((UseAVX > 0) &&
6883             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6884   match(Set dst (XorV src (LoadVector mem)));
6885   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
6886   ins_encode %{
6887     int vlen_enc = vector_length_encoding(this);
6888     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6889   %}
6890   ins_pipe( pipe_slow );
6891 %}
6892 
6893 // --------------------------------- VectorCast --------------------------------------
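// Widening integral casts use sign-extending moves (vpmovsx*); narrowing casts
// use the AVX-512 truncating moves (evpmov*) or mask-and-pack sequences; casts
// to floating point go through the cvtdq2ps/cvtdq2pd family (evcvtqq2ps/pd for
// long sources).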
6894 
6895 instruct vcastBtoX(vec dst, vec src) %{
6896   match(Set dst (VectorCastB2X src));
6897   format %{ "vector_cast_b2x $dst,$src\t!" %}
6898   ins_encode %{
6899     assert(UseAVX > 0, "required");
6900 
6901     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6902     int vlen_enc = vector_length_encoding(this);
6903     switch (to_elem_bt) {
6904       case T_SHORT:
6905         __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6906         break;
6907       case T_INT:
6908         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6909         break;
6910       case T_FLOAT:
6911         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6912         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6913         break;
6914       case T_LONG:
6915         __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6916         break;
6917       case T_DOUBLE: {
6918         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
6919         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
6920         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6921         break;
6922       }
6923       default: assert(false, "%s", type2name(to_elem_bt));
6924     }
6925   %}
6926   ins_pipe( pipe_slow );
6927 %}
6928 
6929 instruct castStoX(vec dst, vec src, rRegP scratch) %{
6930   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6931             Matcher::vector_length(n->in(1)) <= 8 && // src
6932             Matcher::vector_element_basic_type(n) == T_BYTE);
6933   effect(TEMP scratch);
6934   match(Set dst (VectorCastS2X src));
6935   format %{ "vector_cast_s2x $dst,$src\t! using $scratch as TEMP" %}
6936   ins_encode %{
6937     assert(UseAVX > 0, "required");
6938 
6939     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, $scratch$$Register);
6940     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6941   %}
6942   ins_pipe( pipe_slow );
6943 %}
6944 
6945 instruct vcastStoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6946   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6947             Matcher::vector_length(n->in(1)) == 16 && // src
6948             Matcher::vector_element_basic_type(n) == T_BYTE);
6949   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6950   match(Set dst (VectorCastS2X src));
6951   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp, $scratch as TEMP" %}
6952   ins_encode %{
6953     assert(UseAVX > 0, "required");
6954 
6955     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
6956     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6957     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
6958     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6959   %}
6960   ins_pipe( pipe_slow );
6961 %}
6962 
6963 instruct vcastStoX_evex(vec dst, vec src) %{
6964   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
6965             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
6966   match(Set dst (VectorCastS2X src));
6967   format %{ "vector_cast_s2x $dst,$src\t!" %}
6968   ins_encode %{
6969     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6970     int src_vlen_enc = vector_length_encoding(this, $src);
6971     int vlen_enc = vector_length_encoding(this);
6972     switch (to_elem_bt) {
6973       case T_BYTE:
6974         if (!VM_Version::supports_avx512vl()) {
6975           vlen_enc = Assembler::AVX_512bit;
6976         }
6977         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6978         break;
6979       case T_INT:
6980         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6981         break;
6982       case T_FLOAT:
6983         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6984         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6985         break;
6986       case T_LONG:
6987         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6988         break;
6989       case T_DOUBLE: {
6990         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
6991         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
6992         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6993         break;
6994       }
6995       default:
6996         ShouldNotReachHere();
6997     }
6998   %}
6999   ins_pipe( pipe_slow );
7000 %}
7001 
7002 instruct castItoX(vec dst, vec src, rRegP scratch) %{
7003   predicate(UseAVX <= 2 &&
7004             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
7005             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
7006   match(Set dst (VectorCastI2X src));
7007   format %{ "vector_cast_i2x $dst,$src\t! using $scratch as TEMP" %}
7008   effect(TEMP scratch);
7009   ins_encode %{
7010     assert(UseAVX > 0, "required");
7011 
7012     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7013     int vlen_enc = vector_length_encoding(this, $src);
7014 
7015     if (to_elem_bt == T_BYTE) {
7016       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
7017       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7018       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7019     } else {
7020       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
7021       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
7022       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7023     }
7024   %}
7025   ins_pipe( pipe_slow );
7026 %}
7027 
7028 instruct vcastItoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
7029   predicate(UseAVX <= 2 &&
7030             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
7031             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
7032   match(Set dst (VectorCastI2X src));
7033   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp and $scratch as TEMP" %}
7034   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7035   ins_encode %{
7036     assert(UseAVX > 0, "required");
7037 
7038     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7039     int vlen_enc = vector_length_encoding(this, $src);
7040 
7041     if (to_elem_bt == T_BYTE) {
7042       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
7043       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
7044       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7045       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7046     } else {
7047       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
7048       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
7049       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
7050       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7051     }
7052   %}
7053   ins_pipe( pipe_slow );
7054 %}
7055 
7056 instruct vcastItoX_evex(vec dst, vec src) %{
7057   predicate(UseAVX > 2 ||
7058             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
7059   match(Set dst (VectorCastI2X src));
7060   format %{ "vector_cast_i2x $dst,$src\t!" %}
7061   ins_encode %{
7062     assert(UseAVX > 0, "required");
7063 
7064     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
7065     int src_vlen_enc = vector_length_encoding(this, $src);
7066     int dst_vlen_enc = vector_length_encoding(this);
7067     switch (dst_elem_bt) {
7068       case T_BYTE:
7069         if (!VM_Version::supports_avx512vl()) {
7070           src_vlen_enc = Assembler::AVX_512bit;
7071         }
7072         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7073         break;
7074       case T_SHORT:
7075         if (!VM_Version::supports_avx512vl()) {
7076           src_vlen_enc = Assembler::AVX_512bit;
7077         }
7078         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7079         break;
7080       case T_FLOAT:
7081         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7082         break;
7083       case T_LONG:
7084         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7085         break;
7086       case T_DOUBLE:
7087         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7088         break;
7089       default:
7090         ShouldNotReachHere();
7091     }
7092   %}
7093   ins_pipe( pipe_slow );
7094 %}
7095 
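// Long -> byte/short without AVX-512: gather the low dword of every long with a
// shuffle/permute, mask it down to the target element size, and pack.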
7096 instruct vcastLtoBS(vec dst, vec src, rRegP scratch) %{
7097   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
7098             UseAVX <= 2);
7099   match(Set dst (VectorCastL2X src));
7100   effect(TEMP scratch);
7101   format %{ "vector_cast_l2x  $dst,$src\t! using $scratch as TEMP" %}
7102   ins_encode %{
7103     assert(UseAVX > 0, "required");
7104 
7105     int vlen = Matcher::vector_length_in_bytes(this, $src);
7106     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
7107     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
7108                                                       : ExternalAddress(vector_int_to_short_mask());
7109     if (vlen <= 16) {
7110       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
7111       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
7112       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7113     } else {
7114       assert(vlen <= 32, "required");
7115       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
7116       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
7117       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
7118       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7119     }
7120     if (to_elem_bt == T_BYTE) {
7121       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7122     }
7123   %}
7124   ins_pipe( pipe_slow );
7125 %}
7126 
7127 instruct vcastLtoX_evex(vec dst, vec src) %{
7128   predicate(UseAVX > 2 ||
7129             (Matcher::vector_element_basic_type(n) == T_INT ||
7130              Matcher::vector_element_basic_type(n) == T_FLOAT ||
7131              Matcher::vector_element_basic_type(n) == T_DOUBLE));
7132   match(Set dst (VectorCastL2X src));
7133   format %{ "vector_cast_l2x  $dst,$src\t!" %}
7134   ins_encode %{
7135     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7136     int vlen = Matcher::vector_length_in_bytes(this, $src);
7137     int vlen_enc = vector_length_encoding(this, $src);
7138     switch (to_elem_bt) {
7139       case T_BYTE:
7140         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
7141           vlen_enc = Assembler::AVX_512bit;
7142         }
7143         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7144         break;
7145       case T_SHORT:
7146         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
7147           vlen_enc = Assembler::AVX_512bit;
7148         }
7149         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7150         break;
7151       case T_INT:
7152         if (vlen == 8) {
7153           if ($dst$$XMMRegister != $src$$XMMRegister) {
7154             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
7155           }
7156         } else if (vlen == 16) {
7157           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
7158         } else if (vlen == 32) {
7159           if (UseAVX > 2) {
7160             if (!VM_Version::supports_avx512vl()) {
7161               vlen_enc = Assembler::AVX_512bit;
7162             }
7163             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7164           } else {
7165             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
7166             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
7167           }
7168         } else { // vlen == 64
7169           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7170         }
7171         break;
7172       case T_FLOAT:
7173         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
7174         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7175         break;
7176       case T_DOUBLE:
7177         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
7178         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7179         break;
7180 
7181       default: assert(false, "%s", type2name(to_elem_bt));
7182     }
7183   %}
7184   ins_pipe( pipe_slow );
7185 %}
7186 
7187 instruct vcastFtoD_reg(vec dst, vec src) %{
7188   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
7189   match(Set dst (VectorCastF2X src));
7190   format %{ "vector_cast_f2d  $dst,$src\t!" %}
7191   ins_encode %{
7192     int vlen_enc = vector_length_encoding(this);
7193     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7194   %}
7195   ins_pipe( pipe_slow );
7196 %}
7197 
7198 instruct vcastFtoI_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rRegP scratch, rFlagsReg cr) %{
7199   predicate(!VM_Version::supports_avx512vl() &&
7200             Matcher::vector_length_in_bytes(n) < 64 &&
7201             Matcher::vector_element_basic_type(n) == T_INT);
7202   match(Set dst (VectorCastF2X src));
7203   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP scratch, KILL cr);
7204   format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
7205   ins_encode %{
7206     int vlen_enc = vector_length_encoding(this);
7207     __ vector_castF2I_avx($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7208                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
7209                           ExternalAddress(vector_float_signflip()), $scratch$$Register, vlen_enc);
7210   %}
7211   ins_pipe( pipe_slow );
7212 %}
7213 
7214 instruct vcastFtoI_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
7215   predicate((VM_Version::supports_avx512vl() ||
7216              Matcher::vector_length_in_bytes(n) == 64) &&
7217              Matcher::vector_element_basic_type(n) == T_INT);
7218   match(Set dst (VectorCastF2X src));
7219   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
7220   format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
7221   ins_encode %{
7222     int vlen_enc = vector_length_encoding(this);
7223     __ vector_castF2I_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7224                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
7225                            ExternalAddress(vector_float_signflip()), $scratch$$Register, vlen_enc);
7226   %}
7227   ins_pipe( pipe_slow );
7228 %}
7229 
7230 instruct vcastDtoF_reg(vec dst, vec src) %{
7231   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
7232   match(Set dst (VectorCastD2X src));
7233   format %{ "vector_cast_d2x  $dst,$src\t!" %}
7234   ins_encode %{
7235     int vlen_enc = vector_length_encoding(this, $src);
7236     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7237   %}
7238   ins_pipe( pipe_slow );
7239 %}
7240 
7241 instruct vcastDtoL_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
7242   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
7243   match(Set dst (VectorCastD2X src));
7244   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
7245   format %{ "vector_cast_d2l $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
7246   ins_encode %{
7247     int vlen_enc = vector_length_encoding(this);
7248     __ vector_castD2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7249                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
7250                            ExternalAddress(vector_double_signflip()), $scratch$$Register, vlen_enc);
7251   %}
7252   ins_pipe( pipe_slow );
7253 %}
7254 
7255 // --------------------------------- VectorMaskCmp --------------------------------------
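// Without AVX-512 vector masks the comparison result is materialized as a
// vector of all-ones/all-zeros lanes; when the node type is a vectmask the
// result is produced directly in a kReg opmask register (the ev* rules below).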
7256 
7257 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
7258   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7259             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
7260             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7261             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7262   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7263   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
7264   ins_encode %{
7265     int vlen_enc = vector_length_encoding(this, $src1);
7266     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7267     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7268       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7269     } else {
7270       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7271     }
7272   %}
7273   ins_pipe( pipe_slow );
7274 %}
7275 
7276 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
7277   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
7278             n->bottom_type()->isa_vectmask() == NULL &&
7279             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7280   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7281   effect(TEMP scratch, TEMP ktmp);
7282   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7283   ins_encode %{
7284     int vlen_enc = Assembler::AVX_512bit;
7285     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7286     KRegister mask = k0; // The comparison itself is not being masked.
7287     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7288       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7289       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
7290     } else {
7291       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7292       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
7293     }
7294   %}
7295   ins_pipe( pipe_slow );
7296 %}
7297 
7298 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
7299   predicate(n->bottom_type()->isa_vectmask() &&
7300             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7301   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7302   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
7303   ins_encode %{
7304     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
7305     int vlen_enc = vector_length_encoding(this, $src1);
7306     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7307     KRegister mask = k0; // The comparison itself is not being masked.
7308     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7309       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7310     } else {
7311       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7312     }
7313   %}
7314   ins_pipe( pipe_slow );
7315 %}
7316 
7317 instruct vcmp(legVec dst, legVec src1, legVec src2, immI8 cond, rRegP scratch) %{
7318   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7319             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7320             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
7321             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7322             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7323   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7324   effect(TEMP scratch);
7325   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7326   ins_encode %{
7327     int vlen_enc = vector_length_encoding(this, $src1);
7328     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7329     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
7330     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, ww, vlen_enc, $scratch$$Register);
7331   %}
7332   ins_pipe( pipe_slow );
7333 %}
7334 
7335 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
7336   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7337             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7338             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
7339             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 16 && // src1
7340             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7341   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7342   effect(TEMP vtmp1, TEMP vtmp2, TEMP scratch);
7343   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7344   ins_encode %{
7345     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7346     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7347     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
7348     __ vpcmpu(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
7349               $vtmp2$$XMMRegister, $scratch$$Register);
7350   %}
7351   ins_pipe( pipe_slow );
7352 %}
7353 
7354 instruct vcmpu32(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, legVec vtmp3, rRegP scratch) %{
7355   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7356             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7357             Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 32 && // src1
7358             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7359   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7360   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP scratch);
7361   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7362   ins_encode %{
7363     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7364     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7365     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
7366     __ vpcmpu32(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
7367                 $vtmp2$$XMMRegister, $vtmp3$$XMMRegister, $scratch$$Register);
7368   %}
7369   ins_pipe( pipe_slow );
7370 %}
7371 
7372 instruct vcmpu64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
7373   predicate((n->bottom_type()->isa_vectmask() == NULL &&
7374              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
7375              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7376   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7377   effect(TEMP scratch, TEMP ktmp);
7378   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7379   ins_encode %{
7380     assert(UseAVX > 2, "required");
7381 
7382     int vlen_enc = vector_length_encoding(this, $src1);
7383     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7384     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
7385     KRegister mask = k0; // The comparison itself is not being masked.
7386     bool merge = false;
7387     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7388 
7389     switch (src1_elem_bt) {
7390       case T_INT: {
7391         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7392         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7393         break;
7394       }
7395       case T_LONG: {
7396         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7397         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7398         break;
7399       }
7400       default: assert(false, "%s", type2name(src1_elem_bt));
7401     }
7402   %}
7403   ins_pipe( pipe_slow );
7404 %}
7405 
7406 
7407 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
7408   predicate(n->bottom_type()->isa_vectmask() &&
7409             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7410   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7411   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
7412   ins_encode %{
7413     assert(UseAVX > 2, "required");
7414     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
7415 
7416     int vlen_enc = vector_length_encoding(this, $src1);
7417     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7418     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
7419     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7420 
7421     // Compare elements; the result is produced directly in the destination mask register.
7422     switch (src1_elem_bt) {
7423       case T_BYTE: {
7424         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7425         break;
7426       }
7427       case T_SHORT: {
7428         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7429         break;
7430       }
7431       case T_INT: {
7432         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7433         break;
7434       }
7435       case T_LONG: {
7436         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7437         break;
7438       }
7439       default: assert(false, "%s", type2name(src1_elem_bt));
7440     }
7441   %}
7442   ins_pipe( pipe_slow );
7443 %}
7444 
7445 // Extract
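// For vectors wider than 128 bits, the 128-bit lane holding the element is
// first isolated into a temporary (get_lane) and the scalar is then extracted
// from that lane (get_elem); 128-bit sources are extracted directly.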
7446 
7447 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
7448   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
7449   match(Set dst (ExtractI src idx));
7450   match(Set dst (ExtractS src idx));
7451 #ifdef _LP64
7452   match(Set dst (ExtractB src idx));
7453 #endif
7454   format %{ "extractI $dst,$src,$idx\t!" %}
7455   ins_encode %{
7456     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7457 
7458     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7459     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7460   %}
7461   ins_pipe( pipe_slow );
7462 %}
7463 
7464 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
7465   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
7466             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
7467   match(Set dst (ExtractI src idx));
7468   match(Set dst (ExtractS src idx));
7469 #ifdef _LP64
7470   match(Set dst (ExtractB src idx));
7471 #endif
7472   effect(TEMP vtmp);
7473   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
7474   ins_encode %{
7475     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7476 
7477     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7478     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7479     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
7480   %}
7481   ins_pipe( pipe_slow );
7482 %}
7483 
7484 #ifdef _LP64
7485 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
7486   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
7487   match(Set dst (ExtractL src idx));
7488   format %{ "extractL $dst,$src,$idx\t!" %}
7489   ins_encode %{
7490     assert(UseSSE >= 4, "required");
7491     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7492 
7493     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7494   %}
7495   ins_pipe( pipe_slow );
7496 %}
7497 
7498 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
7499   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7500             Matcher::vector_length(n->in(1)) == 8);  // src
7501   match(Set dst (ExtractL src idx));
7502   effect(TEMP vtmp);
7503   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
7504   ins_encode %{
7505     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7506 
7507     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7508     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
7509   %}
7510   ins_pipe( pipe_slow );
7511 %}
7512 #endif
7513 
7514 instruct extractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7515   predicate(Matcher::vector_length(n->in(1)) <= 4);
7516   match(Set dst (ExtractF src idx));
7517   effect(TEMP dst, TEMP tmp, TEMP vtmp);
7518   format %{ "extractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7519   ins_encode %{
7520     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7521 
7522     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $tmp$$Register, $vtmp$$XMMRegister);
7523   %}
7524   ins_pipe( pipe_slow );
7525 %}
7526 
7527 instruct vextractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7528   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
7529             Matcher::vector_length(n->in(1)/*src*/) == 16);
7530   match(Set dst (ExtractF src idx));
7531   effect(TEMP tmp, TEMP vtmp);
7532   format %{ "vextractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7533   ins_encode %{
7534     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7535 
7536     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7537     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant, $tmp$$Register);
7538   %}
7539   ins_pipe( pipe_slow );
7540 %}
7541 
7542 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
7543   predicate(Matcher::vector_length(n->in(1)) == 2); // src
7544   match(Set dst (ExtractD src idx));
7545   format %{ "extractD $dst,$src,$idx\t!" %}
7546   ins_encode %{
7547     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7548 
7549     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7550   %}
7551   ins_pipe( pipe_slow );
7552 %}
7553 
7554 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
7555   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7556             Matcher::vector_length(n->in(1)) == 8);  // src
7557   match(Set dst (ExtractD src idx));
7558   effect(TEMP vtmp);
7559   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
7560   ins_encode %{
7561     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7562 
7563     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7564     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
7565   %}
7566   ins_pipe( pipe_slow );
7567 %}
7568 
7569 // --------------------------------- Vector Blend --------------------------------------
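// The SSE4.1 pblendvb used below takes its mask implicitly in xmm0, hence the
// rxmm0 TEMP; the AVX forms take the mask as an explicit operand, and the
// AVX-512 forms convert the mask to a kReg and use a merging evpblend.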
7570 
7571 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
7572   predicate(UseAVX == 0);
7573   match(Set dst (VectorBlend (Binary dst src) mask));
7574   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
7575   effect(TEMP tmp);
7576   ins_encode %{
7577     assert(UseSSE >= 4, "required");
7578 
7579     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
7580       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
7581     }
7582     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
7583   %}
7584   ins_pipe( pipe_slow );
7585 %}
7586 
7587 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
7588   predicate(UseAVX > 0 &&
7589             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
7590             Matcher::vector_length_in_bytes(n) <= 32 &&
7591             is_integral_type(Matcher::vector_element_basic_type(n)));
7592   match(Set dst (VectorBlend (Binary src1 src2) mask));
7593   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7594   ins_encode %{
7595     int vlen_enc = vector_length_encoding(this);
7596     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7597   %}
7598   ins_pipe( pipe_slow );
7599 %}
7600 
7601 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
7602   predicate(UseAVX > 0 &&
7603             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
7604             Matcher::vector_length_in_bytes(n) <= 32 &&
7605             !is_integral_type(Matcher::vector_element_basic_type(n)));
7606   match(Set dst (VectorBlend (Binary src1 src2) mask));
7607   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7608   ins_encode %{
7609     int vlen_enc = vector_length_encoding(this);
7610     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7611   %}
7612   ins_pipe( pipe_slow );
7613 %}
7614 
7615 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch, kReg ktmp) %{
7616   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
7617             n->in(2)->bottom_type()->isa_vectmask() == NULL);
7618   match(Set dst (VectorBlend (Binary src1 src2) mask));
7619   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch and $ktmp as TEMP" %}
7620   effect(TEMP scratch, TEMP ktmp);
7621   ins_encode %{
7622     int vlen_enc = Assembler::AVX_512bit;
7623     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7624     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register);
7625     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7626   %}
7627   ins_pipe( pipe_slow );
7628 %}
7629 
7630 
7631 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask, rRegP scratch) %{
7632   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
7633             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
7634              VM_Version::supports_avx512bw()));
7635   match(Set dst (VectorBlend (Binary src1 src2) mask));
7636   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch as TEMP" %}
7637   effect(TEMP scratch);
7638   ins_encode %{
7639     int vlen_enc = vector_length_encoding(this);
7640     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7641     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7642   %}
7643   ins_pipe( pipe_slow );
7644 %}
7645 
7646 // --------------------------------- ABS --------------------------------------
7647 // a = |a|
7648 instruct vabsB_reg(vec dst, vec src) %{
7649   match(Set dst (AbsVB  src));
7650   ins_cost(450);
7651   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
7652   ins_encode %{
7653     uint vlen = Matcher::vector_length(this);
7654     if (vlen <= 16) {
7655       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7656     } else {
7657       int vlen_enc = vector_length_encoding(this);
7658       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7659     }
7660   %}
7661   ins_pipe( pipe_slow );
7662 %}
7663 
7664 instruct vabsS_reg(vec dst, vec src) %{
7665   match(Set dst (AbsVS  src));
7666   ins_cost(450);
7667   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
7668   ins_encode %{
7669     uint vlen = Matcher::vector_length(this);
7670     if (vlen <= 8) {
7671       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7672     } else {
7673       int vlen_enc = vector_length_encoding(this);
7674       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7675     }
7676   %}
7677   ins_pipe( pipe_slow );
7678 %}
7679 
7680 instruct vabsI_reg(vec dst, vec src) %{
7681   match(Set dst (AbsVI  src));
7682   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
7683   ins_cost(250);
7684   ins_encode %{
7685     uint vlen = Matcher::vector_length(this);
7686     if (vlen <= 4) {
7687       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7688     } else {
7689       int vlen_enc = vector_length_encoding(this);
7690       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7691     }
7692   %}
7693   ins_pipe( pipe_slow );
7694 %}
7695 
7696 instruct vabsL_reg(vec dst, vec src) %{
7697   match(Set dst (AbsVL  src));
7698   ins_cost(450);
7699   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
7700   ins_encode %{
7701     assert(UseAVX > 2, "required");
7702     int vlen_enc = vector_length_encoding(this);
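    // evpabsq has 128/256-bit encodings only with AVX512VL; otherwise use the
    // full 512-bit form.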
7703     if (!VM_Version::supports_avx512vl()) {
7704       vlen_enc = Assembler::AVX_512bit;
7705     }
7706     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7707   %}
7708   ins_pipe( pipe_slow );
7709 %}
7710 
7711 // --------------------------------- ABSNEG --------------------------------------
7712 
7713 instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
7714   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
7715   match(Set dst (AbsVF src));
7716   match(Set dst (NegVF src));
7717   effect(TEMP scratch);
7718   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
7719   ins_cost(150);
7720   ins_encode %{
7721     int opcode = this->ideal_Opcode();
7722     int vlen = Matcher::vector_length(this);
7723     if (vlen == 2) {
7724       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7725     } else {
7726       assert(vlen == 8 || vlen == 16, "required");
7727       int vlen_enc = vector_length_encoding(this);
7728       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7729     }
7730   %}
7731   ins_pipe( pipe_slow );
7732 %}
7733 
7734 instruct vabsneg4F(vec dst, rRegI scratch) %{
7735   predicate(Matcher::vector_length(n) == 4);
7736   match(Set dst (AbsVF dst));
7737   match(Set dst (NegVF dst));
7738   effect(TEMP scratch);
7739   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
7740   ins_cost(150);
7741   ins_encode %{
7742     int opcode = this->ideal_Opcode();
7743     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
7744   %}
7745   ins_pipe( pipe_slow );
7746 %}
7747 
7748 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
7749   match(Set dst (AbsVD  src));
7750   match(Set dst (NegVD  src));
7751   effect(TEMP scratch);
7752   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
7753   ins_encode %{
7754     int opcode = this->ideal_Opcode();
7755     uint vlen = Matcher::vector_length(this);
7756     if (vlen == 2) {
7757       assert(UseSSE >= 2, "required");
7758       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7759     } else {
7760       int vlen_enc = vector_length_encoding(this);
7761       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7762     }
7763   %}
7764   ins_pipe( pipe_slow );
7765 %}
7766 
7767 //------------------------------------- VectorTest --------------------------------------------
7768 
7769 #ifdef _LP64
7770 instruct vptest_alltrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp1, legVec vtmp2, rFlagsReg cr) %{
7771   predicate(!VM_Version::supports_avx512bwdq() &&
7772             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7773             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7774             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7775   match(Set dst (VectorTest src1 src2 ));
7776   effect(TEMP vtmp1, TEMP vtmp2, KILL cr);
7777   format %{ "vptest_alltrue_lt16 $dst,$src1, $src2\t! using $vtmp1, $vtmp2 and $cr as TEMP" %}
7778   ins_encode %{
7779     int vlen = Matcher::vector_length_in_bytes(this, $src1);
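    // vectortest leaves the all-true result in CF; setb/movzbl turn it into 0/1.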
7780     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
7781     __ setb(Assembler::carrySet, $dst$$Register);
7782     __ movzbl($dst$$Register, $dst$$Register);
7783   %}
7784   ins_pipe( pipe_slow );
7785 %}
7786 
7787 instruct vptest_alltrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7788   predicate(!VM_Version::supports_avx512bwdq() &&
7789             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7790             Matcher::vector_length_in_bytes(n->in(1)) <  64 &&
7791             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7792   match(Set dst (VectorTest src1 src2 ));
7793   effect(KILL cr);
7794   format %{ "vptest_alltrue_ge16  $dst,$src1, $src2\t! using $cr as TEMP" %}
7795   ins_encode %{
7796     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7797     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7798     __ setb(Assembler::carrySet, $dst$$Register);
7799     __ movzbl($dst$$Register, $dst$$Register);
7800   %}
7801   ins_pipe( pipe_slow );
7802 %}
7803 
7804 instruct vptest_alltrue_lt8_evex(rRegI dst, kReg src1, kReg src2, kReg kscratch, rFlagsReg cr) %{
7805   predicate(VM_Version::supports_avx512bwdq() &&
7806             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
7807             n->in(1)->bottom_type()->isa_vectmask() &&
7808             Matcher::vector_length(n->in(1)) < 8);
7809   match(Set dst (VectorTest src1 src2));
7810   effect(KILL cr, TEMP kscratch);
7811   format %{ "vptest_alltrue_lt8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
7812   ins_encode %{
7813     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7814     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7815     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7816     uint masklen = Matcher::vector_length(this, $src1);
7817     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, $kscratch$$KRegister);
7818   %}
7819   ins_pipe( pipe_slow );
7820 %}
7821 
7822 
7823 instruct vptest_alltrue_ge8_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
7824   predicate(VM_Version::supports_avx512bwdq() &&
7825             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
7826             n->in(1)->bottom_type()->isa_vectmask() &&
7827             Matcher::vector_length(n->in(1)) >= 8);
7828   match(Set dst (VectorTest src1 src2));
7829   effect(KILL cr);
7830   format %{ "vptest_alltrue_ge8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
7831   ins_encode %{
7832     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7833     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7834     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7835     uint masklen = Matcher::vector_length(this, $src1);
7836     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, knoreg);
7837   %}
7838   ins_pipe( pipe_slow );
7839 %}
7840 
7841 
7842 instruct vptest_anytrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp, rFlagsReg cr) %{
7843   predicate(!VM_Version::supports_avx512bwdq() &&
7844             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7845             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7846             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7847   match(Set dst (VectorTest src1 src2 ));
7848   effect(TEMP vtmp, KILL cr);
7849   format %{ "vptest_anytrue_lt16 $dst,$src1,$src2\t! using $vtmp, $cr as TEMP" %}
7850   ins_encode %{
7851     int vlen = Matcher::vector_length_in_bytes(this, $src1);
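    // vectortest leaves the any-true result in ZF; setb(notZero)/movzbl turn it into 0/1.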
7852     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7853     __ setb(Assembler::notZero, $dst$$Register);
7854     __ movzbl($dst$$Register, $dst$$Register);
7855   %}
7856   ins_pipe( pipe_slow );
7857 %}
7858 
7859 instruct vptest_anytrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7860   predicate(!VM_Version::supports_avx512bwdq() &&
7861             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7862             Matcher::vector_length_in_bytes(n->in(1)) < 64  &&
7863             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7864   match(Set dst (VectorTest src1 src2 ));
7865   effect(KILL cr);
7866   format %{ "vptest_anytrue_ge16 $dst,$src1,$src2\t! using $cr as TEMP" %}
7867   ins_encode %{
7868     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7869     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7870     __ setb(Assembler::notZero, $dst$$Register);
7871     __ movzbl($dst$$Register, $dst$$Register);
7872   %}
7873   ins_pipe( pipe_slow );
7874 %}
7875 
7876 instruct vptest_anytrue_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
7877   predicate(VM_Version::supports_avx512bwdq() &&
7878             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7879   match(Set dst (VectorTest src1 src2));
7880   effect(KILL cr);
7881   format %{ "vptest_anytrue_lt8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
7882   ins_encode %{
7883     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7884     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7885     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7886     uint  masklen = Matcher::vector_length(this, $src1);
7887     __ anytrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister);
7888   %}
7889   ins_pipe( pipe_slow );
7890 %}
7891 
7892 instruct cmpvptest_anytrue_lt16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, legVec vtmp) %{
7893   predicate(!VM_Version::supports_avx512bwdq() &&
7894             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 4 &&
7895             Matcher::vector_length_in_bytes(n->in(1)->in(1)) < 16 &&
7896             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7897   match(Set cr (CmpI (VectorTest src1 src2) zero));
7898   effect(TEMP vtmp);
7899   format %{ "cmpvptest_anytrue_lt16 $src1,$src2\t! using $vtmp as TEMP" %}
7900   ins_encode %{
7901     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7902     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7903   %}
7904   ins_pipe( pipe_slow );
7905 %}
7906 
7907 instruct cmpvptest_anytrue_ge16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero) %{
7908   predicate(!VM_Version::supports_avx512bwdq() &&
7909             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 16 &&
7910             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <  64 &&
7911             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7912   match(Set cr (CmpI (VectorTest src1 src2) zero));
7913   format %{ "cmpvptest_anytrue_ge16 $src1,$src2\t!" %}
7914   ins_encode %{
7915     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7916     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7917   %}
7918   ins_pipe( pipe_slow );
7919 %}
7920 
7921 instruct cmpvptest_anytrue_evex(rFlagsReg cr, kReg src1, kReg src2, immI_0 zero) %{
7922   predicate(VM_Version::supports_avx512bwdq() &&
7923             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7924   match(Set cr (CmpI (VectorTest src1 src2) zero));
7925   format %{ "cmpvptest_anytrue_evex $src1,$src2\t!" %}
7926   ins_encode %{
7927     uint masklen = Matcher::vector_length(this, $src1);
7928     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7929     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7930     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
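    // ktest's narrowest form tests 8 mask bits, so round shorter masks up to 8.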
7931     masklen = masklen < 8 ? 8 : masklen;
7932     __ ktest(masklen, $src1$$KRegister, $src2$$KRegister);
7933   %}
7934   ins_pipe( pipe_slow );
7935 %}
7936 #endif
7937 
7938 //------------------------------------- LoadMask --------------------------------------------
7939 
7940 instruct loadMask(legVec dst, legVec src) %{
7941   predicate(n->bottom_type()->isa_vectmask() == NULL && !VM_Version::supports_avx512vlbw());
7942   match(Set dst (VectorLoadMask src));
7943   effect(TEMP dst);
7944   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
7945   ins_encode %{
7946     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7947     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7948     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
7949   %}
7950   ins_pipe( pipe_slow );
7951 %}
7952 
7953 instruct loadMask64(kReg dst, vec src, vec xtmp, rRegI tmp) %{
7954   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
7955   match(Set dst (VectorLoadMask src));
7956   effect(TEMP xtmp, TEMP tmp);
7957   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp and $tmp as TEMP" %}
7958   ins_encode %{
7959     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
7960                         $tmp$$Register, true, Assembler::AVX_512bit);
7961   %}
7962   ins_pipe( pipe_slow );
7963 %}
7964 
7965 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
7966   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
7967   match(Set dst (VectorLoadMask src));
7968   effect(TEMP xtmp);
7969   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
7970   ins_encode %{
7971     int vlen_enc = vector_length_encoding(in(1));
7972     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
7973                         noreg, false, vlen_enc);
7974   %}
7975   ins_pipe( pipe_slow );
7976 %}
7977 
7978 //------------------------------------- StoreMask --------------------------------------------
7979 
7980 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
7981   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
7982   match(Set dst (VectorStoreMask src size));
7983   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
7984   ins_encode %{
7985     int vlen = Matcher::vector_length(this);
7986     if (vlen <= 16 && UseAVX <= 2) {
7987       assert(UseSSE >= 3, "required");
7988       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7989     } else {
7990       assert(UseAVX > 0, "required");
7991       int src_vlen_enc = vector_length_encoding(this, $src);
7992       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7993     }
7994   %}
7995   ins_pipe( pipe_slow );
7996 %}
7997 
7998 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
7999   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8000   match(Set dst (VectorStoreMask src size));
8001   effect(TEMP_DEF dst, TEMP xtmp);
8002   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8003   ins_encode %{
8004     int vlen_enc = Assembler::AVX_128bit;
8005     int vlen = Matcher::vector_length(this);
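    // Narrow the boolean word lanes to bytes: pabs maps -1 to 1 and the pack
    // instructions fold the word lanes down to byte lanes.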
8006     if (vlen <= 8) {
8007       assert(UseSSE >= 3, "required");
8008       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
8009       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
8010       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
8011     } else {
8012       assert(UseAVX > 0, "required");
8013       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
8014       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8015       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8016     }
8017   %}
8018   ins_pipe( pipe_slow );
8019 %}
8020 
8021 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
8022   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8023   match(Set dst (VectorStoreMask src size));
8024   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8025   effect(TEMP_DEF dst, TEMP xtmp);
8026   ins_encode %{
8027     int vlen_enc = Assembler::AVX_128bit;
8028     int vlen = Matcher::vector_length(this);
8029     if (vlen <= 4) {
8030       assert(UseSSE >= 3, "required");
8031       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
8032       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
8033       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
8034       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
8035     } else {
8036       assert(UseAVX > 0, "required");
8037       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
8038       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
8039       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8040       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
8041       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8042     }
8043   %}
8044   ins_pipe( pipe_slow );
8045 %}
8046 
8047 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
8048   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
8049   match(Set dst (VectorStoreMask src size));
8050   effect(TEMP_DEF dst, TEMP xtmp);
8051   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8052   ins_encode %{
8053     assert(UseSSE >= 3, "required");
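    // Pack the two 64-bit mask lanes down to bytes: pshufd(0x8) gathers the low
    // dword of each quadword, pabsd maps -1 to 1, and packusdw/packuswb narrow
    // dword -> word -> byte (the upper halves pack against the zeroed $xtmp).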
8054     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
8055     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
8056     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
8057     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
8058     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
8059   %}
8060   ins_pipe( pipe_slow );
8061 %}
8062 
8063 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
8064   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
8065   match(Set dst (VectorStoreMask src size));
8066   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
8067   effect(TEMP_DEF dst, TEMP vtmp);
8068   ins_encode %{
8069     int vlen_enc = Assembler::AVX_128bit;
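    // 0x88 gathers the low dword of each quadword within both 128-bit lanes;
    // the upper lane is extracted and blended into elements 2-3 (0xC), then the
    // dwords are packed down to bytes and vpabsb maps -1 to 1.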
8070     __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
8071     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
8072     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
8073     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8074     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8075     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8076     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8077   %}
8078   ins_pipe( pipe_slow );
8079 %}
8080 
8081 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
8082   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8083   match(Set dst (VectorStoreMask src size));
8084   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8085   ins_encode %{
8086     int src_vlen_enc = vector_length_encoding(this, $src);
8087     int dst_vlen_enc = vector_length_encoding(this);
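    // Without AVX512VL, evpmovdb only has a 512-bit encoding.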
8088     if (!VM_Version::supports_avx512vl()) {
8089       src_vlen_enc = Assembler::AVX_512bit;
8090     }
8091     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8092     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8093   %}
8094   ins_pipe( pipe_slow );
8095 %}
8096 
8097 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
8098   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8099   match(Set dst (VectorStoreMask src size));
8100   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8101   ins_encode %{
8102     int src_vlen_enc = vector_length_encoding(this, $src);
8103     int dst_vlen_enc = vector_length_encoding(this);
8104     if (!VM_Version::supports_avx512vl()) {
8105       src_vlen_enc = Assembler::AVX_512bit;
8106     }
8107     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8108     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8109   %}
8110   ins_pipe( pipe_slow );
8111 %}
8112 
8113 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size, rRegI tmp) %{
8114   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
8115   match(Set dst (VectorStoreMask mask size));
8116   effect(TEMP_DEF dst, TEMP tmp);
8117   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
8118   ins_encode %{
8119     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
8120     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
8121                  false, Assembler::AVX_512bit, $tmp$$Register);
8122     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
8123   %}
8124   ins_pipe( pipe_slow );
8125 %}
8126 
8127 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
8128   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
8129   match(Set dst (VectorStoreMask mask size));
8130   effect(TEMP_DEF dst);
8131   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
8132   ins_encode %{
8133     int dst_vlen_enc = vector_length_encoding(this);
8134     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
8135     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8136   %}
8137   ins_pipe( pipe_slow );
8138 %}
8139 
8140 instruct vmaskcast_evex(kReg dst) %{
8141   predicate(Matcher::vector_length(n) == Matcher::vector_length(n->in(1)));
8142   match(Set dst (VectorMaskCast dst));
8143   ins_cost(0);
8144   format %{ "vector_mask_cast $dst" %}
8145   ins_encode %{
8146     // empty
8147   %}
8148   ins_pipe(empty);
8149 %}
8150 
8151 instruct vmaskcast(vec dst) %{
8152   predicate((Matcher::vector_length(n) == Matcher::vector_length(n->in(1))) &&
8153             (Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))));
8154   match(Set dst (VectorMaskCast dst));
8155   ins_cost(0);
8156   format %{ "vector_mask_cast $dst" %}
8157   ins_encode %{
8158     // empty
8159   %}
8160   ins_pipe(empty);
8161 %}
8162 
8163 //-------------------------------- Load Iota Indices ----------------------------------
8164 
8165 instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
8166   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
8167   match(Set dst (VectorLoadConst src));
8168   effect(TEMP scratch);
8169   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
8170   ins_encode %{
8171      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8172      __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen_in_bytes);
8173   %}
8174   ins_pipe( pipe_slow );
8175 %}
8176 
8177 //-------------------------------- Rearrange ----------------------------------
8178 
8179 // LoadShuffle/Rearrange for Byte
8180 
8181 instruct loadShuffleB(vec dst) %{
8182   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
8183   match(Set dst (VectorLoadShuffle dst));
8184   format %{ "vector_load_shuffle $dst, $dst" %}
8185   ins_encode %{
8186     // empty
8187   %}
8188   ins_pipe( pipe_slow );
8189 %}
8190 
8191 instruct rearrangeB(vec dst, vec shuffle) %{
8192   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8193             Matcher::vector_length(n) < 32);
8194   match(Set dst (VectorRearrange dst shuffle));
8195   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8196   ins_encode %{
8197     assert(UseSSE >= 4, "required");
8198     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8199   %}
8200   ins_pipe( pipe_slow );
8201 %}
8202 
8203 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
8204   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8205             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
8206   match(Set dst (VectorRearrange src shuffle));
8207   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
8208   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
8209   ins_encode %{
8210     assert(UseAVX >= 2, "required");
8211     // Swap src into vtmp1
8212     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
8213     // Shuffle swapped src to get entries from other 128 bit lane
8214     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8215     // Shuffle original src to get entries from self 128 bit lane
8216     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8217     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
8218     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
8219     // Perform the blend
8220     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
8221   %}
8222   ins_pipe( pipe_slow );
8223 %}
8224 
8225 instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
8226   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8227             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
8228   match(Set dst (VectorRearrange src shuffle));
8229   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8230   ins_encode %{
8231     int vlen_enc = vector_length_encoding(this);
8232     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8233   %}
8234   ins_pipe( pipe_slow );
8235 %}
8236 
8237 // LoadShuffle/Rearrange for Short
8238 
8239 instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
8240   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8241             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
8242   match(Set dst (VectorLoadShuffle src));
8243   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8244   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8245   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
8248     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8249     if (UseAVX == 0) {
8250       assert(vlen_in_bytes <= 16, "required");
8251       // Multiply each shuffle by two to get byte index
8252       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
8253       __ psllw($vtmp$$XMMRegister, 1);
8254 
8255       // Duplicate to create 2 copies of byte index
8256       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
8257       __ psllw($dst$$XMMRegister, 8);
8258       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
8259 
8260       // Add one to get alternate byte index
8261       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
8262       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
8263     } else {
8264       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
8265       int vlen_enc = vector_length_encoding(this);
8266       // Multiply each shuffle by two to get byte index
8267       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
8268       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
8269 
8270       // Duplicate to create 2 copies of byte index
8271       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
8272       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8273 
8274       // Add one to get alternate byte index
8275       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, $scratch$$Register);
8276     }
8277   %}
8278   ins_pipe( pipe_slow );
8279 %}
8280 
8281 instruct rearrangeS(vec dst, vec shuffle) %{
8282   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8283             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
8284   match(Set dst (VectorRearrange dst shuffle));
8285   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8286   ins_encode %{
8287     assert(UseSSE >= 4, "required");
8288     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8289   %}
8290   ins_pipe( pipe_slow );
8291 %}
8292 
8293 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
8294   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8295             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
8296   match(Set dst (VectorRearrange src shuffle));
8297   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
8298   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
8299   ins_encode %{
8300     assert(UseAVX >= 2, "required");
8301     // Swap src into vtmp1
8302     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
8303     // Shuffle swapped src to get entries from other 128 bit lane
8304     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8305     // Shuffle original src to get entries from self 128 bit lane
8306     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8307     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
8308     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
8309     // Perform the blend
8310     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
8311   %}
8312   ins_pipe( pipe_slow );
8313 %}
8314 
8315 instruct loadShuffleS_evex(vec dst, vec src) %{
8316   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8317             VM_Version::supports_avx512bw());
8318   match(Set dst (VectorLoadShuffle src));
8319   format %{ "vector_load_shuffle $dst, $src" %}
8320   ins_encode %{
8321     int vlen_enc = vector_length_encoding(this);
8322     if (!VM_Version::supports_avx512vl()) {
8323       vlen_enc = Assembler::AVX_512bit;
8324     }
8325     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8326   %}
8327   ins_pipe( pipe_slow );
8328 %}
8329 
8330 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
8331   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8332             VM_Version::supports_avx512bw());
8333   match(Set dst (VectorRearrange src shuffle));
8334   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8335   ins_encode %{
8336     int vlen_enc = vector_length_encoding(this);
8337     if (!VM_Version::supports_avx512vl()) {
8338       vlen_enc = Assembler::AVX_512bit;
8339     }
8340     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8341   %}
8342   ins_pipe( pipe_slow );
8343 %}
8344 
8345 // LoadShuffle/Rearrange for Integer and Float
8346 
8347 instruct loadShuffleI(vec dst, vec src, vec vtmp, rRegP scratch) %{
8348   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8349             Matcher::vector_length(n) == 4 && UseAVX < 2);
8350   match(Set dst (VectorLoadShuffle src));
8351   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8352   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8353   ins_encode %{
8354     assert(UseSSE >= 4, "required");
8355 
    // Create a byte shuffle mask from the int shuffle mask
    // only a byte shuffle instruction is available on these platforms.
8358 
8359     // Duplicate and multiply each shuffle by 4
8360     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
8361     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
8362     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
8363     __ psllw($vtmp$$XMMRegister, 2);
8364 
8365     // Duplicate again to create 4 copies of byte index
8366     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
8367     __ psllw($dst$$XMMRegister, 8);
8368     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
8369 
8370     // Add 3,2,1,0 to get alternate byte index
8371     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
8372     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
8373   %}
8374   ins_pipe( pipe_slow );
8375 %}
8376 
8377 instruct rearrangeI(vec dst, vec shuffle) %{
8378  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8379            Matcher::vector_length(n) == 4 && UseAVX < 2);
8380   match(Set dst (VectorRearrange dst shuffle));
8381   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8382   ins_encode %{
8383     assert(UseSSE >= 4, "required");
8384     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8385   %}
8386   ins_pipe( pipe_slow );
8387 %}
8388 
8389 instruct loadShuffleI_avx(vec dst, vec src) %{
8390   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8391             UseAVX >= 2);
8392   match(Set dst (VectorLoadShuffle src));
8393   format %{ "vector_load_shuffle $dst, $src" %}
8394   ins_encode %{
    int vlen_enc = vector_length_encoding(this);
8396     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8397   %}
8398   ins_pipe( pipe_slow );
8399 %}
8400 
8401 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
8402   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8403             UseAVX >= 2);
8404   match(Set dst (VectorRearrange src shuffle));
8405   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8406   ins_encode %{
8407     int vlen_enc = vector_length_encoding(this);
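    // vpermd is only defined for 256/512-bit vectors, so widen a 128-bit encoding.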
8408     if (vlen_enc == Assembler::AVX_128bit) {
8409       vlen_enc = Assembler::AVX_256bit;
8410     }
8411     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8412   %}
8413   ins_pipe( pipe_slow );
8414 %}
8415 
8416 // LoadShuffle/Rearrange for Long and Double
8417 
8418 instruct loadShuffleL(vec dst, vec src, vec vtmp, rRegP scratch) %{
8419   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8420             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
8421   match(Set dst (VectorLoadShuffle src));
8422   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8423   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8424   ins_encode %{
8425     assert(UseAVX >= 2, "required");
8426 
8427     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask;
    // only a double word shuffle instruction is available on these platforms.
8430 
8431     // Multiply each shuffle by two to get double word index
8432     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
8433     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
8434 
8435     // Duplicate each double word shuffle
8436     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
8437     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8438 
8439     // Add one to get alternate double word index
8440     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, $scratch$$Register);
8441   %}
8442   ins_pipe( pipe_slow );
8443 %}
8444 
8445 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
8446   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8447             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
8448   match(Set dst (VectorRearrange src shuffle));
8449   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8450   ins_encode %{
8451     assert(UseAVX >= 2, "required");
8452 
8453     int vlen_enc = vector_length_encoding(this);
8454     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8455   %}
8456   ins_pipe( pipe_slow );
8457 %}
8458 
8459 instruct loadShuffleL_evex(vec dst, vec src) %{
8460   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8461             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
8462   match(Set dst (VectorLoadShuffle src));
8463   format %{ "vector_load_shuffle $dst, $src" %}
8464   ins_encode %{
8465     assert(UseAVX > 2, "required");
8466 
8467     int vlen_enc = vector_length_encoding(this);
8468     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8469   %}
8470   ins_pipe( pipe_slow );
8471 %}
8472 
8473 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
8474   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8475             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
8476   match(Set dst (VectorRearrange src shuffle));
8477   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8478   ins_encode %{
8479     assert(UseAVX > 2, "required");
8480 
8481     int vlen_enc = vector_length_encoding(this);
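    // The variable-index vpermq form exists only for 256/512-bit vectors.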
8482     if (vlen_enc == Assembler::AVX_128bit) {
8483       vlen_enc = Assembler::AVX_256bit;
8484     }
8485     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8486   %}
8487   ins_pipe( pipe_slow );
8488 %}
8489 
8490 // --------------------------------- FMA --------------------------------------
8491 // a * b + c
8492 
8493 instruct vfmaF_reg(vec a, vec b, vec c) %{
8494   match(Set c (FmaVF  c (Binary a b)));
8495   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8496   ins_cost(150);
8497   ins_encode %{
8498     assert(UseFMA, "not enabled");
8499     int vlen_enc = vector_length_encoding(this);
8500     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8501   %}
8502   ins_pipe( pipe_slow );
8503 %}
8504 
8505 instruct vfmaF_mem(vec a, memory b, vec c) %{
8506   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8507   match(Set c (FmaVF  c (Binary a (LoadVector b))));
8508   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8509   ins_cost(150);
8510   ins_encode %{
8511     assert(UseFMA, "not enabled");
8512     int vlen_enc = vector_length_encoding(this);
8513     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8514   %}
8515   ins_pipe( pipe_slow );
8516 %}
8517 
8518 instruct vfmaD_reg(vec a, vec b, vec c) %{
8519   match(Set c (FmaVD  c (Binary a b)));
8520   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8521   ins_cost(150);
8522   ins_encode %{
8523     assert(UseFMA, "not enabled");
8524     int vlen_enc = vector_length_encoding(this);
8525     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8526   %}
8527   ins_pipe( pipe_slow );
8528 %}
8529 
8530 instruct vfmaD_mem(vec a, memory b, vec c) %{
8531   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8532   match(Set c (FmaVD  c (Binary a (LoadVector b))));
8533   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8534   ins_cost(150);
8535   ins_encode %{
8536     assert(UseFMA, "not enabled");
8537     int vlen_enc = vector_length_encoding(this);
8538     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8539   %}
8540   ins_pipe( pipe_slow );
8541 %}
8542 
8543 // --------------------------------- Vector Multiply Add --------------------------------------
8544 
8545 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
8546   predicate(UseAVX == 0);
8547   match(Set dst (MulAddVS2VI dst src1));
8548   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
8549   ins_encode %{
8550     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
8551   %}
8552   ins_pipe( pipe_slow );
8553 %}
8554 
8555 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
8556   predicate(UseAVX > 0);
8557   match(Set dst (MulAddVS2VI src1 src2));
8558   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
8559   ins_encode %{
8560     int vlen_enc = vector_length_encoding(this);
8561     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8562   %}
8563   ins_pipe( pipe_slow );
8564 %}
8565 
8566 // --------------------------------- Vector Multiply Add Add ----------------------------------
8567 
8568 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
8569   predicate(VM_Version::supports_avx512_vnni());
8570   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
8571   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
8572   ins_encode %{
8573     assert(UseAVX > 2, "required");
8574     int vlen_enc = vector_length_encoding(this);
8575     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8576   %}
8577   ins_pipe( pipe_slow );
8578   ins_cost(10);
8579 %}
8580 
8581 // --------------------------------- PopCount --------------------------------------
8582 
8583 instruct vpopcountI(vec dst, vec src) %{
8584   match(Set dst (PopCountVI src));
8585   format %{ "vpopcntd  $dst,$src\t! vector popcount packedI" %}
8586   ins_encode %{
8587     assert(UsePopCountInstruction, "not enabled");
8588 
8589     int vlen_enc = vector_length_encoding(this);
8590     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8591   %}
8592   ins_pipe( pipe_slow );
8593 %}
8594 
8595 instruct vpopcountL(vec dst, vec src) %{
8596   match(Set dst (PopCountVL src));
8597   format %{ "vpopcntq  $dst,$src\t! vector popcount packedL" %}
8598   ins_encode %{
8599     assert(UsePopCountInstruction, "not enabled");
8600 
8601     int vlen_enc = vector_length_encoding(this, $src);
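    // vpopcntq leaves 64-bit counts per lane; evpmovqd narrows them down to
    // 32-bit lanes for the result.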
8602     __ vpopcntq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8603     __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8604 
8605   %}
8606   ins_pipe( pipe_slow );
8607 %}
8608 
8609 // --------------------------------- Bitwise Ternary Logic ----------------------------------
8610 
8611 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
8612   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
8613   effect(TEMP dst);
8614   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8615   ins_encode %{
8616     int vector_len = vector_length_encoding(this);
8617     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
8618   %}
8619   ins_pipe( pipe_slow );
8620 %}
8621 
8622 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
8623   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
8624   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
8625   effect(TEMP dst);
8626   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8627   ins_encode %{
8628     int vector_len = vector_length_encoding(this);
8629     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
8630   %}
8631   ins_pipe( pipe_slow );
8632 %}
8633 
8634 // --------------------------------- Rotation Operations ----------------------------------
8635 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
8636   match(Set dst (RotateLeftV src shift));
8637   match(Set dst (RotateRightV src shift));
8638   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
8639   ins_encode %{
8640     int opcode      = this->ideal_Opcode();
8641     int vector_len  = vector_length_encoding(this);
8642     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8643     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
8644   %}
8645   ins_pipe( pipe_slow );
8646 %}
8647 
8648 instruct vprorate(vec dst, vec src, vec shift) %{
8649   match(Set dst (RotateLeftV src shift));
8650   match(Set dst (RotateRightV src shift));
8651   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
8652   ins_encode %{
8653     int opcode      = this->ideal_Opcode();
8654     int vector_len  = vector_length_encoding(this);
8655     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8656     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8657   %}
8658   ins_pipe( pipe_slow );
8659 %}
8660 
8661 #ifdef _LP64
8662 // ---------------------------------- Masked Operations ------------------------------------
8663 
8664 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
8665   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
8666   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
8667   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
8668   ins_encode %{
8669     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
8670     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
8671 
8672     Label DONE;
8673     int vlen_enc = vector_length_encoding(this, $src1);
8674     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
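    // Result is -1 when every lane selected by $mask compares equal, otherwise
    // the index of the first mismatching lane ($mask is expected to be a prefix
    // mask here, so unselected lanes only occur past the compared range).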
8675 
8676     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
8677     __ mov64($dst$$Register, -1L);
8678     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
8679     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
8680     __ jccb(Assembler::carrySet, DONE);
8681     __ kmovql($dst$$Register, $ktmp1$$KRegister);
8682     __ notq($dst$$Register);
8683     __ tzcntq($dst$$Register, $dst$$Register);
8684     __ bind(DONE);
8685   %}
8686   ins_pipe( pipe_slow );
8687 %}
8688 
8689 
8690 instruct vmasked_load64(vec dst, memory mem, kReg mask) %{
8691   match(Set dst (LoadVectorMasked mem mask));
8692   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
8693   ins_encode %{
8694     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
8695     int vector_len = vector_length_encoding(this);
8696     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
8697   %}
8698   ins_pipe( pipe_slow );
8699 %}
8700 
8701 instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
8702   match(Set dst (VectorMaskGen len));
8703   effect(TEMP temp);
8704   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
8705   ins_encode %{
8706     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
8707   %}
8708   ins_pipe( pipe_slow );
8709 %}
8710 
8711 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
8712   match(Set dst (VectorMaskGen len));
8713   format %{ "vector_mask_gen $len \t! vector mask generator" %}
8714   effect(TEMP temp);
8715   ins_encode %{
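    // Materialize a mask with the low $len bits set by shifting an all-ones
    // constant right by (64 - $len).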
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
8717     __ kmovql($dst$$KRegister, $temp$$Register);
8718   %}
8719   ins_pipe( pipe_slow );
8720 %}
8721 
8722 instruct vmasked_store64(memory mem, vec src, kReg mask) %{
8723   match(Set mem (StoreVectorMasked mem (Binary src mask)));
8724   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
8725   ins_encode %{
8726     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
8727     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
8728     int vector_len = vector_length_encoding(src_node);
8729     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, vector_len);
8730   %}
8731   ins_pipe( pipe_slow );
8732 %}
8733 
8734 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
8735   predicate(n->in(1)->bottom_type()->isa_vectmask());
8736   match(Set dst (VectorMaskToLong mask));
8737   effect(TEMP dst, KILL cr);
8738   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
8739   ins_encode %{
8740     int opcode = this->ideal_Opcode();
8741     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8742     int mask_len = Matcher::vector_length(this, $mask);
8743     int mask_size = mask_len * type2aelembytes(mbt);
8744     int vlen_enc = vector_length_encoding(this, $mask);
8745     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
8746                              $dst$$Register, mask_len, mask_size, vlen_enc);
8747   %}
8748   ins_pipe( pipe_slow );
8749 %}
8750 
8751 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
8752   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8753   match(Set dst (VectorMaskToLong mask));
8754   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
8755   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
8756   ins_encode %{
8757     int opcode = this->ideal_Opcode();
8758     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8759     int mask_len = Matcher::vector_length(this, $mask);
8760     int vlen_enc = vector_length_encoding(this, $mask);
8761     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8762                              $dst$$Register, mask_len, mbt, vlen_enc);
8763   %}
8764   ins_pipe( pipe_slow );
8765 %}
8766 
8767 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
8768   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
8769   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
8770   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
8771   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
8772   ins_encode %{
8773     int opcode = this->ideal_Opcode();
8774     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8775     int mask_len = Matcher::vector_length(this, $mask);
8776     int vlen_enc = vector_length_encoding(this, $mask);
8777     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8778                              $dst$$Register, mask_len, mbt, vlen_enc);
8779   %}
8780   ins_pipe( pipe_slow );
8781 %}
8782 
8783 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
8784   predicate(n->in(1)->bottom_type()->isa_vectmask());
8785   match(Set dst (VectorMaskTrueCount mask));
8786   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
8787   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
8788   ins_encode %{
8789     int opcode = this->ideal_Opcode();
8790     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8791     int mask_len = Matcher::vector_length(this, $mask);
8792     int mask_size = mask_len * type2aelembytes(mbt);
8793     int vlen_enc = vector_length_encoding(this, $mask);
8794     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
8795                              $tmp$$Register, mask_len, mask_size, vlen_enc);
8796   %}
8797   ins_pipe( pipe_slow );
8798 %}
8799 
8800 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8801   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8802   match(Set dst (VectorMaskTrueCount mask));
8803   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8804   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8805   ins_encode %{
8806     int opcode = this->ideal_Opcode();
8807     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8808     int mask_len = Matcher::vector_length(this, $mask);
8809     int vlen_enc = vector_length_encoding(this, $mask);
8810     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8811                              $tmp$$Register, mask_len, mbt, vlen_enc);
8812   %}
8813   ins_pipe( pipe_slow );
8814 %}
8815 
8816 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8817   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
8818   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
8819   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8820   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8821   ins_encode %{
8822     int opcode = this->ideal_Opcode();
8823     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8824     int mask_len = Matcher::vector_length(this, $mask);
8825     int vlen_enc = vector_length_encoding(this, $mask);
8826     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8827                              $tmp$$Register, mask_len, mbt, vlen_enc);
8828   %}
8829   ins_pipe( pipe_slow );
8830 %}
8831 
8832 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
8833   predicate(n->in(1)->bottom_type()->isa_vectmask());
8834   match(Set dst (VectorMaskFirstTrue mask));
8835   match(Set dst (VectorMaskLastTrue mask));
8836   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
8837   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
8838   ins_encode %{
8839     int opcode = this->ideal_Opcode();
8840     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8841     int mask_len = Matcher::vector_length(this, $mask);
8842     int mask_size = mask_len * type2aelembytes(mbt);
8843     int vlen_enc = vector_length_encoding(this, $mask);
8844     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
8845                              $tmp$$Register, mask_len, mask_size, vlen_enc);
8846   %}
8847   ins_pipe( pipe_slow );
8848 %}
8849 
8850 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8851   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8852   match(Set dst (VectorMaskFirstTrue mask));
8853   match(Set dst (VectorMaskLastTrue mask));
8854   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8855   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8856   ins_encode %{
8857     int opcode = this->ideal_Opcode();
8858     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8859     int mask_len = Matcher::vector_length(this, $mask);
8860     int vlen_enc = vector_length_encoding(this, $mask);
8861     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8862                              $tmp$$Register, mask_len, mbt, vlen_enc);
8863   %}
8864   ins_pipe( pipe_slow );
8865 %}
8866 
8867 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8868   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
8869   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
8870   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
8871   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8872   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8873   ins_encode %{
8874     int opcode = this->ideal_Opcode();
8875     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8876     int mask_len = Matcher::vector_length(this, $mask);
8877     int vlen_enc = vector_length_encoding(this, $mask);
8878     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8879                              $tmp$$Register, mask_len, mbt, vlen_enc);
8880   %}
8881   ins_pipe( pipe_slow );
8882 %}
8883 #endif // _LP64
8884 
8885 // ---------------------------------- Vector Masked Operations ------------------------------------
8886 
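// The rules in this section implement Vector API operations that carry a mask.  As an
// illustration (Java, jdk.incubator.vector; 'a' and 'b' stand for some IntVector values
// and are only an example):
//
//   VectorMask<Integer> m = VectorMask.fromLong(IntVector.SPECIES_512, 0xFFL);
//   IntVector r = a.lanewise(VectorOperators.ADD, b, m);   // masked add
//
// C2 presents such an operation as (AddVI (Binary dst src2) mask), and the rules below
// lower it to a single EVEX instruction with a k-register write mask (here vpaddd
// dst{k}, dst, src2) through C2_MacroAssembler::evmasked_op, which derives the concrete
// instruction from the ideal opcode and the element basic type.  The boolean argument
// passed as 'true' selects merge-masking, so lanes whose mask bit is clear keep the
// previous contents of dst, matching the Vector API semantics for unselected lanes.
// Most operations come in a register-register and a register-memory flavor.
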
8887 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
8888   match(Set dst (AddVB (Binary dst src2) mask));
8889   match(Set dst (AddVS (Binary dst src2) mask));
8890   match(Set dst (AddVI (Binary dst src2) mask));
8891   match(Set dst (AddVL (Binary dst src2) mask));
8892   match(Set dst (AddVF (Binary dst src2) mask));
8893   match(Set dst (AddVD (Binary dst src2) mask));
8894   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
8895   ins_encode %{
8896     int vlen_enc = vector_length_encoding(this);
8897     BasicType bt = Matcher::vector_element_basic_type(this);
8898     int opc = this->ideal_Opcode();
8899     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8900                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8901   %}
8902   ins_pipe( pipe_slow );
8903 %}
8904 
8905 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
8906   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
8907   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
8908   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
8909   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
8910   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
8911   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
8912   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
8913   ins_encode %{
8914     int vlen_enc = vector_length_encoding(this);
8915     BasicType bt = Matcher::vector_element_basic_type(this);
8916     int opc = this->ideal_Opcode();
8917     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8918                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8919   %}
8920   ins_pipe( pipe_slow );
8921 %}
8922 
8923 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
8924   match(Set dst (XorV (Binary dst src2) mask));
8925   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
8926   ins_encode %{
8927     int vlen_enc = vector_length_encoding(this);
8928     BasicType bt = Matcher::vector_element_basic_type(this);
8929     int opc = this->ideal_Opcode();
8930     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8931                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8932   %}
8933   ins_pipe( pipe_slow );
8934 %}
8935 
8936 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
8937   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
8938   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
8939   ins_encode %{
8940     int vlen_enc = vector_length_encoding(this);
8941     BasicType bt = Matcher::vector_element_basic_type(this);
8942     int opc = this->ideal_Opcode();
8943     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8944                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8945   %}
8946   ins_pipe( pipe_slow );
8947 %}
8948 
8949 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
8950   match(Set dst (OrV (Binary dst src2) mask));
8951   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
8952   ins_encode %{
8953     int vlen_enc = vector_length_encoding(this);
8954     BasicType bt = Matcher::vector_element_basic_type(this);
8955     int opc = this->ideal_Opcode();
8956     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8957                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8958   %}
8959   ins_pipe( pipe_slow );
8960 %}
8961 
8962 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
8963   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
8964   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
8965   ins_encode %{
8966     int vlen_enc = vector_length_encoding(this);
8967     BasicType bt = Matcher::vector_element_basic_type(this);
8968     int opc = this->ideal_Opcode();
8969     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8970                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8971   %}
8972   ins_pipe( pipe_slow );
8973 %}
8974 
8975 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
8976   match(Set dst (AndV (Binary dst src2) mask));
8977   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
8978   ins_encode %{
8979     int vlen_enc = vector_length_encoding(this);
8980     BasicType bt = Matcher::vector_element_basic_type(this);
8981     int opc = this->ideal_Opcode();
8982     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8983                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8984   %}
8985   ins_pipe( pipe_slow );
8986 %}
8987 
8988 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
8989   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
8990   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
8991   ins_encode %{
8992     int vlen_enc = vector_length_encoding(this);
8993     BasicType bt = Matcher::vector_element_basic_type(this);
8994     int opc = this->ideal_Opcode();
8995     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8996                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8997   %}
8998   ins_pipe( pipe_slow );
8999 %}
9000 
9001 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
9002   match(Set dst (SubVB (Binary dst src2) mask));
9003   match(Set dst (SubVS (Binary dst src2) mask));
9004   match(Set dst (SubVI (Binary dst src2) mask));
9005   match(Set dst (SubVL (Binary dst src2) mask));
9006   match(Set dst (SubVF (Binary dst src2) mask));
9007   match(Set dst (SubVD (Binary dst src2) mask));
9008   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
9009   ins_encode %{
9010     int vlen_enc = vector_length_encoding(this);
9011     BasicType bt = Matcher::vector_element_basic_type(this);
9012     int opc = this->ideal_Opcode();
9013     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9014                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9015   %}
9016   ins_pipe( pipe_slow );
9017 %}
9018 
9019 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
9020   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
9021   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
9022   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
9023   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
9024   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
9025   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
9026   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
9027   ins_encode %{
9028     int vlen_enc = vector_length_encoding(this);
9029     BasicType bt = Matcher::vector_element_basic_type(this);
9030     int opc = this->ideal_Opcode();
9031     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9032                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9033   %}
9034   ins_pipe( pipe_slow );
9035 %}
9036 
9037 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
9038   match(Set dst (MulVS (Binary dst src2) mask));
9039   match(Set dst (MulVI (Binary dst src2) mask));
9040   match(Set dst (MulVL (Binary dst src2) mask));
9041   match(Set dst (MulVF (Binary dst src2) mask));
9042   match(Set dst (MulVD (Binary dst src2) mask));
9043   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
9044   ins_encode %{
9045     int vlen_enc = vector_length_encoding(this);
9046     BasicType bt = Matcher::vector_element_basic_type(this);
9047     int opc = this->ideal_Opcode();
9048     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9049                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9050   %}
9051   ins_pipe( pipe_slow );
9052 %}
9053 
9054 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
9055   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
9056   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
9057   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
9058   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
9059   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
9060   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
9061   ins_encode %{
9062     int vlen_enc = vector_length_encoding(this);
9063     BasicType bt = Matcher::vector_element_basic_type(this);
9064     int opc = this->ideal_Opcode();
9065     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9066                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9067   %}
9068   ins_pipe( pipe_slow );
9069 %}
9070 
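// Unary masked operations (SqrtVF/SqrtVD here, AbsV* further below) have no second
// source, so $dst is simply passed for every operand slot of evmasked_op; only the
// lanes selected by $mask are recomputed and the remaining lanes keep their value.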
9071 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
9072   match(Set dst (SqrtVF dst mask));
9073   match(Set dst (SqrtVD dst mask));
9074   ins_cost(100);
9075   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
9076   ins_encode %{
9077     int vlen_enc = vector_length_encoding(this);
9078     BasicType bt = Matcher::vector_element_basic_type(this);
9079     int opc = this->ideal_Opcode();
9080     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9081                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
9082   %}
9083   ins_pipe( pipe_slow );
9084 %}
9085 
9086 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
9087   match(Set dst (DivVF (Binary dst src2) mask));
9088   match(Set dst (DivVD (Binary dst src2) mask));
9089   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
9090   ins_encode %{
9091     int vlen_enc = vector_length_encoding(this);
9092     BasicType bt = Matcher::vector_element_basic_type(this);
9093     int opc = this->ideal_Opcode();
9094     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9095                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9096   %}
9097   ins_pipe( pipe_slow );
9098 %}
9099 
9100 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
9101   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
9102   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
9103   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
9104   ins_encode %{
9105     int vlen_enc = vector_length_encoding(this);
9106     BasicType bt = Matcher::vector_element_basic_type(this);
9107     int opc = this->ideal_Opcode();
9108     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9109                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9110   %}
9111   ins_pipe( pipe_slow );
9112 %}
9113 
9114 
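// Masked rotates and shifts.  The *_imm_masked forms hand the count to evmasked_op as
// a constant, the register forms hand it over as a vector.  For shifts the register
// forms are additionally split on ShiftVNode::is_var_shift(), which is forwarded as
// the trailing boolean of evmasked_op so the per-lane variable encodings
// (vpsllv/vpsrlv/vpsrav style) can be chosen when each lane carries its own count.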
9115 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
9116   match(Set dst (RotateLeftV (Binary dst shift) mask));
9117   match(Set dst (RotateRightV (Binary dst shift) mask));
9118   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
9119   ins_encode %{
9120     int vlen_enc = vector_length_encoding(this);
9121     BasicType bt = Matcher::vector_element_basic_type(this);
9122     int opc = this->ideal_Opcode();
9123     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9124                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9125   %}
9126   ins_pipe( pipe_slow );
9127 %}
9128 
9129 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
9130   match(Set dst (RotateLeftV (Binary dst src2) mask));
9131   match(Set dst (RotateRightV (Binary dst src2) mask));
9132   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
9133   ins_encode %{
9134     int vlen_enc = vector_length_encoding(this);
9135     BasicType bt = Matcher::vector_element_basic_type(this);
9136     int opc = this->ideal_Opcode();
9137     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9138                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9139   %}
9140   ins_pipe( pipe_slow );
9141 %}
9142 
9143 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9144   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
9145   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
9146   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
9147   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
9148   ins_encode %{
9149     int vlen_enc = vector_length_encoding(this);
9150     BasicType bt = Matcher::vector_element_basic_type(this);
9151     int opc = this->ideal_Opcode();
9152     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9153                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9154   %}
9155   ins_pipe( pipe_slow );
9156 %}
9157 
9158 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
9159   predicate(!n->as_ShiftV()->is_var_shift());
9160   match(Set dst (LShiftVS (Binary dst src2) mask));
9161   match(Set dst (LShiftVI (Binary dst src2) mask));
9162   match(Set dst (LShiftVL (Binary dst src2) mask));
9163   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9164   ins_encode %{
9165     int vlen_enc = vector_length_encoding(this);
9166     BasicType bt = Matcher::vector_element_basic_type(this);
9167     int opc = this->ideal_Opcode();
9168     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9169                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9170   %}
9171   ins_pipe( pipe_slow );
9172 %}
9173 
9174 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9175   predicate(n->as_ShiftV()->is_var_shift());
9176   match(Set dst (LShiftVS (Binary dst src2) mask));
9177   match(Set dst (LShiftVI (Binary dst src2) mask));
9178   match(Set dst (LShiftVL (Binary dst src2) mask));
9179   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9180   ins_encode %{
9181     int vlen_enc = vector_length_encoding(this);
9182     BasicType bt = Matcher::vector_element_basic_type(this);
9183     int opc = this->ideal_Opcode();
9184     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9185                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9186   %}
9187   ins_pipe( pipe_slow );
9188 %}
9189 
9190 instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
9191   match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
9192   match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
9193   match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
9194   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9195   ins_encode %{
9196     int vlen_enc = vector_length_encoding(this);
9197     BasicType bt = Matcher::vector_element_basic_type(this);
9198     int opc = this->ideal_Opcode();
9199     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9200                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9201   %}
9202   ins_pipe( pipe_slow );
9203 %}
9204 
9205 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9206   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
9207   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
9208   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
9209   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
9210   ins_encode %{
9211     int vlen_enc = vector_length_encoding(this);
9212     BasicType bt = Matcher::vector_element_basic_type(this);
9213     int opc = this->ideal_Opcode();
9214     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9215                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9216   %}
9217   ins_pipe( pipe_slow );
9218 %}
9219 
9220 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
9221   predicate(!n->as_ShiftV()->is_var_shift());
9222   match(Set dst (RShiftVS (Binary dst src2) mask));
9223   match(Set dst (RShiftVI (Binary dst src2) mask));
9224   match(Set dst (RShiftVL (Binary dst src2) mask));
9225   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9226   ins_encode %{
9227     int vlen_enc = vector_length_encoding(this);
9228     BasicType bt = Matcher::vector_element_basic_type(this);
9229     int opc = this->ideal_Opcode();
9230     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9231                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9232   %}
9233   ins_pipe( pipe_slow );
9234 %}
9235 
9236 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9237   predicate(n->as_ShiftV()->is_var_shift());
9238   match(Set dst (RShiftVS (Binary dst src2) mask));
9239   match(Set dst (RShiftVI (Binary dst src2) mask));
9240   match(Set dst (RShiftVL (Binary dst src2) mask));
9241   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9242   ins_encode %{
9243     int vlen_enc = vector_length_encoding(this);
9244     BasicType bt = Matcher::vector_element_basic_type(this);
9245     int opc = this->ideal_Opcode();
9246     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9247                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9248   %}
9249   ins_pipe( pipe_slow );
9250 %}
9251 
9252 instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
9253   match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
9254   match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
9255   match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
9256   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9257   ins_encode %{
9258     int vlen_enc = vector_length_encoding(this);
9259     BasicType bt = Matcher::vector_element_basic_type(this);
9260     int opc = this->ideal_Opcode();
9261     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9262                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9263   %}
9264   ins_pipe( pipe_slow );
9265 %}
9266 
9267 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9268   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
9269   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
9270   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
9271   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
9272   ins_encode %{
9273     int vlen_enc = vector_length_encoding(this);
9274     BasicType bt = Matcher::vector_element_basic_type(this);
9275     int opc = this->ideal_Opcode();
9276     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9277                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9278   %}
9279   ins_pipe( pipe_slow );
9280 %}
9281 
9282 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
9283   predicate(!n->as_ShiftV()->is_var_shift());
9284   match(Set dst (URShiftVS (Binary dst src2) mask));
9285   match(Set dst (URShiftVI (Binary dst src2) mask));
9286   match(Set dst (URShiftVL (Binary dst src2) mask));
9287   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
9288   ins_encode %{
9289     int vlen_enc = vector_length_encoding(this);
9290     BasicType bt = Matcher::vector_element_basic_type(this);
9291     int opc = this->ideal_Opcode();
9292     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9293                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9294   %}
9295   ins_pipe( pipe_slow );
9296 %}
9297 
9298 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9299   predicate(n->as_ShiftV()->is_var_shift());
9300   match(Set dst (URShiftVS (Binary dst src2) mask));
9301   match(Set dst (URShiftVI (Binary dst src2) mask));
9302   match(Set dst (URShiftVL (Binary dst src2) mask));
9303   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
9304   ins_encode %{
9305     int vlen_enc = vector_length_encoding(this);
9306     BasicType bt = Matcher::vector_element_basic_type(this);
9307     int opc = this->ideal_Opcode();
9308     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9309                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9310   %}
9311   ins_pipe( pipe_slow );
9312 %}
9313 
9314 instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
9315   match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
9316   match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
9317   match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
9318   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
9319   ins_encode %{
9320     int vlen_enc = vector_length_encoding(this);
9321     BasicType bt = Matcher::vector_element_basic_type(this);
9322     int opc = this->ideal_Opcode();
9323     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9324                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9325   %}
9326   ins_pipe( pipe_slow );
9327 %}
9328 
9329 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
9330   match(Set dst (MaxV (Binary dst src2) mask));
9331   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
9332   ins_encode %{
9333     int vlen_enc = vector_length_encoding(this);
9334     BasicType bt = Matcher::vector_element_basic_type(this);
9335     int opc = this->ideal_Opcode();
9336     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9337                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9338   %}
9339   ins_pipe( pipe_slow );
9340 %}
9341 
9342 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
9343   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
9344   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
9345   ins_encode %{
9346     int vlen_enc = vector_length_encoding(this);
9347     BasicType bt = Matcher::vector_element_basic_type(this);
9348     int opc = this->ideal_Opcode();
9349     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9350                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9351   %}
9352   ins_pipe( pipe_slow );
9353 %}
9354 
9355 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
9356   match(Set dst (MinV (Binary dst src2) mask));
9357   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
9358   ins_encode %{
9359     int vlen_enc = vector_length_encoding(this);
9360     BasicType bt = Matcher::vector_element_basic_type(this);
9361     int opc = this->ideal_Opcode();
9362     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9363                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9364   %}
9365   ins_pipe( pipe_slow );
9366 %}
9367 
9368 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
9369   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
9370   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
9371   ins_encode %{
9372     int vlen_enc = vector_length_encoding(this);
9373     BasicType bt = Matcher::vector_element_basic_type(this);
9374     int opc = this->ideal_Opcode();
9375     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9376                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9377   %}
9378   ins_pipe( pipe_slow );
9379 %}
9380 
9381 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
9382   match(Set dst (VectorRearrange (Binary dst src2) mask));
9383   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
9384   ins_encode %{
9385     int vlen_enc = vector_length_encoding(this);
9386     BasicType bt = Matcher::vector_element_basic_type(this);
9387     int opc = this->ideal_Opcode();
9388     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9389                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
9390   %}
9391   ins_pipe( pipe_slow );
9392 %}
9393 
9394 instruct vabs_masked(vec dst, kReg mask) %{
9395   match(Set dst (AbsVB dst mask));
9396   match(Set dst (AbsVS dst mask));
9397   match(Set dst (AbsVI dst mask));
9398   match(Set dst (AbsVL dst mask));
9399   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
9400   ins_cost(100);
9401   ins_encode %{
9402     int vlen_enc = vector_length_encoding(this);
9403     BasicType bt = Matcher::vector_element_basic_type(this);
9404     int opc = this->ideal_Opcode();
9405     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9406                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
9407   %}
9408   ins_pipe( pipe_slow );
9409 %}
9410 
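// Masked fused multiply-add.  The ideal shape is (FmaV (Binary dst src2) (Binary src3 mask));
// evmasked_op lowers it to an EVEX masked FMA (vfmadd*) with merge-masking, so lanes not
// selected by $mask keep the previous contents of $dst.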
9411 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
9412   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
9413   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
9414   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
9415   ins_encode %{
9416     int vlen_enc = vector_length_encoding(this);
9417     BasicType bt = Matcher::vector_element_basic_type(this);
9418     int opc = this->ideal_Opcode();
9419     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9420                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
9421   %}
9422   ins_pipe( pipe_slow );
9423 %}
9424 
9425 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
9426   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
9427   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
9428   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
9429   ins_encode %{
9430     int vlen_enc = vector_length_encoding(this);
9431     BasicType bt = Matcher::vector_element_basic_type(this);
9432     int opc = this->ideal_Opcode();
9433     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9434                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
9435   %}
9436   ins_pipe( pipe_slow );
9437 %}
9438 
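// Masked vector compare producing a predicate mask.  The element type selects the
// instruction: evpcmpb/w/d/q for integral types (with the signed/unsigned form derived
// from the BoolTest condition) and evcmpps/evcmppd for floating point; the incoming
// $mask is applied as the write mask of the compare result.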
9439 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask, rRegP scratch) %{
9440   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
9441   effect(TEMP scratch);
9442   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask\t! using $scratch as TEMP" %}
9443   ins_encode %{
9444     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
9445     int vlen_enc = vector_length_encoding(this, $src1);
9446     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
9447 
9448     // Dispatch the comparison on the element basic type of the operands.
9449     switch (src1_elem_bt) {
9450       case T_BYTE: {
9451         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9452         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9453         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9454         break;
9455       }
9456       case T_SHORT: {
9457         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9458         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9459         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9460         break;
9461       }
9462       case T_INT: {
9463         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9464         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9465         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9466         break;
9467       }
9468       case T_LONG: {
9469         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9470         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9471         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9472         break;
9473       }
9474       case T_FLOAT: {
9475         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
9476         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
9477         break;
9478       }
9479       case T_DOUBLE: {
9480         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
9481         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
9482         break;
9483       }
9484       default: assert(false, "%s", type2name(src1_elem_bt)); break;
9485     }
9486   %}
9487   ins_pipe( pipe_slow );
9488 %}
9489 
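// MaskAll broadcasts a scalar boolean into every lane of a predicate mask (e.g. what a
// VectorSpecies.maskAll(...) call produces); for lane counts up to 32 this is handled
// by vector_maskall_operation from a general-purpose register source.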
9490 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
9491   predicate(Matcher::vector_length(n) <= 32);
9492   match(Set dst (MaskAll src));
9493   format %{ "mask_all_evexI_LE32 $dst, $src \t! mask all operation" %}
9494   ins_encode %{
9495     int mask_len = Matcher::vector_length(this);
9496     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
9497   %}
9498   ins_pipe( pipe_slow );
9499 %}
9500 
9501 #ifdef _LP64
9502 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
9503   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
9504   match(Set dst (XorVMask src (MaskAll cnt)));
9505   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
9506   format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
9507   ins_encode %{
9508     uint masklen = Matcher::vector_length(this);
9509     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
9510   %}
9511   ins_pipe( pipe_slow );
9512 %}
9513 
9514 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
9515   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
9516             (Matcher::vector_length(n) == 16) ||
9517             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
9518   match(Set dst (XorVMask src (MaskAll cnt)));
9519   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
9520   ins_encode %{
9521     uint masklen = Matcher::vector_length(this);
9522     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
9523   %}
9524   ins_pipe( pipe_slow );
9525 %}
9526 
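// VectorLongToMask turns the low N bits of a long (e.g. the result of a
// VectorMask.fromLong(species, bits) call) into an N-lane mask.  Targets without
// native predicate masks (isa_vectmask() == NULL) expand the bits into a boolean
// vector, split on lane count; EVEX targets with mask registers need only a kmov.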
9527 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
9528   predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) <= 8);
9529   match(Set dst (VectorLongToMask src));
9530   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
9531   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
9532   ins_encode %{
9533     int mask_len = Matcher::vector_length(this);
9534     int vec_enc  = vector_length_encoding(mask_len);
9535     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
9536                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
9537   %}
9538   ins_pipe( pipe_slow );
9539 %}
9540 
9541 
9542 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
9543   predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) > 8);
9544   match(Set dst (VectorLongToMask src));
9545   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
9546   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
9547   ins_encode %{
9548     int mask_len = Matcher::vector_length(this);
9549     assert(mask_len <= 32, "invalid mask length");
9550     int vec_enc  = vector_length_encoding(mask_len);
9551     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
9552                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
9553   %}
9554   ins_pipe( pipe_slow );
9555 %}
9556 
9557 instruct long_to_mask_evex(kReg dst, rRegL src) %{
9558   predicate(n->bottom_type()->isa_vectmask());
9559   match(Set dst (VectorLongToMask src));
9560   format %{ "long_to_mask_evex $dst, $src\t!" %}
9561   ins_encode %{
9562     __ kmov($dst$$KRegister, $src$$Register);
9563   %}
9564   ins_pipe( pipe_slow );
9565 %}
9566 #endif // _LP64
9567 
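// Logical operations between predicate masks (AndVMask/OrVMask/XorVMask) map onto the
// k-register logic instructions via masked_op.  Mask lengths below 16 are rounded up
// to 16 when AVX512DQ is absent, because the byte-granular forms (kandb/korb/kxorb)
// require that extension.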
9568 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
9569   match(Set dst (AndVMask src1 src2));
9570   match(Set dst (OrVMask src1 src2));
9571   match(Set dst (XorVMask src1 src2));
9572   effect(TEMP kscratch);
9573   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
9574   ins_encode %{
9575     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
9576     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
9577     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
9578     uint masklen = Matcher::vector_length(this);
9579     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
9580     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
9581   %}
9582   ins_pipe( pipe_slow );
9583 %}
9584 
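// Masked ternary logic.  MacroLogicV is C2's fused boolean expression over three vector
// inputs, with $func carrying the 8-bit truth table; it lowers to vpternlog{d,q} with
// merge-masking under $mask, in a register and a memory form for the third operand.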
9585 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
9586   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
9587   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
9588   ins_encode %{
9589     int vlen_enc = vector_length_encoding(this);
9590     BasicType bt = Matcher::vector_element_basic_type(this);
9591     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
9592                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
9593   %}
9594   ins_pipe( pipe_slow );
9595 %}
9596 
9597 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
9598   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
9599   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
9600   ins_encode %{
9601     int vlen_enc = vector_length_encoding(this);
9602     BasicType bt = Matcher::vector_element_basic_type(this);
9603     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
9604                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
9605   %}
9606   ins_pipe( pipe_slow );
9607 %}
9608 
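// The CastVV rules below are type-only nodes: they pin the vector or mask type for the
// matcher and register allocator but emit no code (size(0), empty encoding), with one
// variant per register class (kReg, vec, legVec).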
9609 instruct castMM(kReg dst)
9610 %{
9611   match(Set dst (CastVV dst));
9612 
9613   size(0);
9614   format %{ "# castVV of $dst" %}
9615   ins_encode(/* empty encoding */);
9616   ins_cost(0);
9617   ins_pipe(empty);
9618 %}
9619 
9620 instruct castVV(vec dst)
9621 %{
9622   match(Set dst (CastVV dst));
9623 
9624   size(0);
9625   format %{ "# castVV of $dst" %}
9626   ins_encode(/* empty encoding */);
9627   ins_cost(0);
9628   ins_pipe(empty);
9629 %}
9630 
9631 instruct castVVLeg(legVec dst)
9632 %{
9633   match(Set dst (CastVV dst));
9634 
9635   size(0);
9636   format %{ "# castVV of $dst" %}
9637   ins_encode(/* empty encoding */);
9638   ins_cost(0);
9639   ins_pipe(empty);
9640 %}