//
// Copyright (c) 2011, 2021, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
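//
// As an illustration only (this repeats the first XMM definition given
// below; it is not an additional definition):
//      reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares XMM0 as Save-On-Call for both the VM and the C calling
// convention, spills it as a float via Op_RegF, and uses encoding 0.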

// XMM registers.  512-bit registers, 16 words each, labeled (a)-(p).
// Word a in each register holds a Float, words (a,b) hold a Double.
// The whole registers are used in SSE4.2 intrinsics, array copy stubs
// and superword operations (see the UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperWord flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No XMM register is preserved across function calls;
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              (XMM16-XMM31 are volatile); XMM0-XMM3 might hold parameters
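//
// For example (naming only, derived from the definitions below): XMM0 is
// word (a) of the first register and holds a Float, XMM0 plus XMM0b hold
// a Double, and XMM0 through XMM0p together cover the full 512-bit register.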

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 964 
 965 // Class for all 128bit vector registers
 966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 973                       XMM7,  XMM7b,  XMM7c,  XMM7d
 974 #ifdef _LP64
 975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 977                       XMM10, XMM10b, XMM10c, XMM10d,
 978                       XMM11, XMM11b, XMM11c, XMM11d,
 979                       XMM12, XMM12b, XMM12c, XMM12d,
 980                       XMM13, XMM13b, XMM13c, XMM13d,
 981                       XMM14, XMM14b, XMM14c, XMM14d,
 982                       XMM15, XMM15b, XMM15c, XMM15d
 983 #endif
 984                       );
 985 
 986 // Class for all 128bit vector registers
 987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 994                       XMM7,  XMM7b,  XMM7c,  XMM7d
 995 #ifdef _LP64
 996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 998                       XMM10, XMM10b, XMM10c, XMM10d,
 999                       XMM11, XMM11b, XMM11c, XMM11d,
1000                       XMM12, XMM12b, XMM12c, XMM12d,
1001                       XMM13, XMM13b, XMM13c, XMM13d,
1002                       XMM14, XMM14b, XMM14c, XMM14d,
1003                       XMM15, XMM15b, XMM15c, XMM15d,
1004                       XMM16, XMM16b, XMM16c, XMM16d,
1005                       XMM17, XMM17b, XMM17c, XMM17d,
1006                       XMM18, XMM18b, XMM18c, XMM18d,
1007                       XMM19, XMM19b, XMM19c, XMM19d,
1008                       XMM20, XMM20b, XMM20c, XMM20d,
1009                       XMM21, XMM21b, XMM21c, XMM21d,
1010                       XMM22, XMM22b, XMM22c, XMM22d,
1011                       XMM23, XMM23b, XMM23c, XMM23d,
1012                       XMM24, XMM24b, XMM24c, XMM24d,
1013                       XMM25, XMM25b, XMM25c, XMM25d,
1014                       XMM26, XMM26b, XMM26c, XMM26d,
1015                       XMM27, XMM27b, XMM27c, XMM27d,
1016                       XMM28, XMM28b, XMM28c, XMM28d,
1017                       XMM29, XMM29b, XMM29c, XMM29d,
1018                       XMM30, XMM30b, XMM30c, XMM30d,
1019                       XMM31, XMM31b, XMM31c, XMM31d
1020 #endif
1021                       );
1022 
1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1025 
1026 // Class for all 256bit vector registers
1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1035 #ifdef _LP64
1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
1044 #endif
1045                       );
1046 
1047 // Class for all 256bit vector registers
1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1056 #ifdef _LP64
1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1081 #endif
1082                       );
1083 
1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1086 
1087 // Class for all 512bit vector registers
1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1096 #ifdef _LP64
1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1121 #endif
1122                       );
1123 
1124 // Class for restricted 512bit vector registers
1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1133 #ifdef _LP64
1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1142 #endif
1143                       );
1144 
1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1147 
1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1149 %}
1150 
1151 
1152 //----------SOURCE BLOCK-------------------------------------------------------
1153 // This is a block of C++ code which provides values, functions, and
1154 // definitions necessary in the rest of the architecture description
1155 
1156 source_hpp %{
1157 // Header information of the source block.
1158 // Method declarations/definitions which are used outside
1159 // the ad-scope can conveniently be defined here.
1160 //
1161 // To keep related declarations/definitions/uses close together,
// we switch between source %{ ... %} and source_hpp %{ ... %} blocks freely as needed.
1163 
1164 #include "runtime/vm_version.hpp"
1165 
1166 class NativeJump;
1167 
1168 class CallStubImpl {
1169 
1170   //--------------------------------------------------------------
1171   //---<  Used for optimization in Compile::shorten_branches  >---
1172   //--------------------------------------------------------------
1173 
1174  public:
1175   // Size of call trampoline stub.
1176   static uint size_call_trampoline() {
1177     return 0; // no call trampolines on this platform
1178   }
1179 
1180   // number of relocations needed by a call trampoline stub
1181   static uint reloc_call_trampoline() {
1182     return 0; // no call trampolines on this platform
1183   }
1184 };
1185 
1186 class HandlerImpl {
1187 
1188  public:
1189 
1190   static int emit_exception_handler(CodeBuffer &cbuf);
1191   static int emit_deopt_handler(CodeBuffer& cbuf);
1192 
1193   static uint size_exception_handler() {
1194     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1197     // Note that this value is also credited (in output.cpp) to
1198     // the size of the code section.
1199     return NativeJump::instruction_size;
1200   }
1201 
1202 #ifdef _LP64
1203   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for an unreachable address.
1205     return 15+3;
1206   }
1207 #else
1208   static uint size_deopt_handler() {
1209     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1212     // Note that this value is also credited (in output.cpp) to
1213     // the size of the code section.
1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1215   }
1216 #endif
1217 };
1218 
1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
1220   switch(bytes) {
1221     case  4: // fall-through
1222     case  8: // fall-through
1223     case 16: return Assembler::AVX_128bit;
1224     case 32: return Assembler::AVX_256bit;
1225     case 64: return Assembler::AVX_512bit;
1226 
1227     default: {
1228       ShouldNotReachHere();
1229       return Assembler::AVX_NoVec;
1230     }
1231   }
1232 }
1233 
1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
1236 }
1237 
1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1239   uint def_idx = use->operand_index(opnd);
1240   Node* def = use->in(def_idx);
1241   return vector_length_encoding(def);
1242 }
1243 
1244 static inline bool is_unsigned_booltest_pred(int bt) {
  return ((bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare);
1246 }
1247 
1248 class Node::PD {
1249 public:
1250   enum NodeFlags {
1251     Flag_intel_jcc_erratum = Node::_last_flag << 1,
1252     _last_flag             = Flag_intel_jcc_erratum
1253   };
1254 };
1255 
1256 %} // end source_hpp
1257 
1258 source %{
1259 
1260 #include "opto/addnode.hpp"
1261 #include "c2_intelJccErratum_x86.hpp"
1262 
1263 void PhaseOutput::pd_perform_mach_node_analysis() {
1264   if (VM_Version::has_intel_jcc_erratum()) {
1265     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
1266     _buf_sizes._code += extra_padding;
1267   }
1268 }
1269 
1270 int MachNode::pd_alignment_required() const {
1271   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
1272     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
1273     return IntelJccErratum::largest_jcc_size() + 1;
1274   } else {
1275     return 1;
1276   }
1277 }
1278 
1279 int MachNode::compute_padding(int current_offset) const {
1280   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
1281     Compile* C = Compile::current();
1282     PhaseOutput* output = C->output();
1283     Block* block = output->block();
1284     int index = output->index();
1285     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
1286   } else {
1287     return 0;
1288   }
1289 }
1290 
1291 // Emit exception handler code.
1292 // Stuff framesize into a register and call a VM stub routine.
1293 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1294 
1295   // Note that the code buffer's insts_mark is always relative to insts.
1296   // That's why we must use the macroassembler to generate a handler.
1297   C2_MacroAssembler _masm(&cbuf);
1298   address base = __ start_a_stub(size_exception_handler());
1299   if (base == NULL) {
1300     ciEnv::current()->record_failure("CodeCache is full");
1301     return 0;  // CodeBuffer::expand failed
1302   }
1303   int offset = __ offset();
1304   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1305   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1306   __ end_a_stub();
1307   return offset;
1308 }
1309 
1310 // Emit deopt handler code.
1311 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1312 
1313   // Note that the code buffer's insts_mark is always relative to insts.
1314   // That's why we must use the macroassembler to generate a handler.
1315   C2_MacroAssembler _masm(&cbuf);
1316   address base = __ start_a_stub(size_deopt_handler());
1317   if (base == NULL) {
1318     ciEnv::current()->record_failure("CodeCache is full");
1319     return 0;  // CodeBuffer::expand failed
1320   }
1321   int offset = __ offset();
1322 
1323 #ifdef _LP64
1324   address the_pc = (address) __ pc();
1325   Label next;
  // Push the value of "the_pc" on the stack without destroying any registers,
  // as they may all be live.
1328 
1329   // push address of "next"
1330   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1331   __ bind(next);
1332   // adjust it so it matches "the_pc"
1333   __ subptr(Address(rsp, 0), __ offset() - offset);
1334 #else
1335   InternalAddress here(__ pc());
1336   __ pushptr(here.addr());
1337 #endif
1338 
1339   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1340   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1341   __ end_a_stub();
1342   return offset;
1343 }
1344 
1345 Assembler::Width widthForType(BasicType bt) {
1346   if (bt == T_BYTE) {
1347     return Assembler::B;
1348   } else if (bt == T_SHORT) {
1349     return Assembler::W;
1350   } else if (bt == T_INT) {
1351     return Assembler::D;
1352   } else {
1353     assert(bt == T_LONG, "not a long: %s", type2name(bt));
1354     return Assembler::Q;
1355   }
1356 }
1357 
1358 //=============================================================================
1359 
1360   // Float masks come from different places depending on platform.
1361 #ifdef _LP64
1362   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1363   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1364   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1365   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1366 #else
1367   static address float_signmask()  { return (address)float_signmask_pool; }
1368   static address float_signflip()  { return (address)float_signflip_pool; }
1369   static address double_signmask() { return (address)double_signmask_pool; }
1370   static address double_signflip() { return (address)double_signflip_pool; }
1371 #endif
1372   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1373   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1374   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1375   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1376   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1377   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
1378   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1379   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
1380   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1381   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1382   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1383   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1384   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1385 
1386 //=============================================================================
1387 const bool Matcher::match_rule_supported(int opcode) {
1388   if (!has_match_rule(opcode)) {
1389     return false; // no match rule present
1390   }
1391   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1392   switch (opcode) {
1393     case Op_AbsVL:
1394     case Op_StoreVectorScatter:
1395       if (UseAVX < 3) {
1396         return false;
1397       }
1398       break;
1399     case Op_PopCountI:
1400     case Op_PopCountL:
1401       if (!UsePopCountInstruction) {
1402         return false;
1403       }
1404       break;
1405     case Op_PopCountVI:
1406       if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
1407         return false;
1408       }
1409       break;
1410     case Op_MulVI:
1411       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
1412         return false;
1413       }
1414       break;
1415     case Op_MulVL:
1416       if (UseSSE < 4) { // only with SSE4_1 or AVX
1417         return false;
1418       }
1419       break;
1420     case Op_MulReductionVL:
1421       if (VM_Version::supports_avx512dq() == false) {
1422         return false;
1423       }
1424       break;
1425     case Op_AddReductionVL:
1426       if (UseSSE < 2) { // requires at least SSE2
1427         return false;
1428       }
1429       break;
1430     case Op_AbsVB:
1431     case Op_AbsVS:
1432     case Op_AbsVI:
1433     case Op_AddReductionVI:
1434     case Op_AndReductionV:
1435     case Op_OrReductionV:
1436     case Op_XorReductionV:
1437       if (UseSSE < 3) { // requires at least SSSE3
1438         return false;
1439       }
1440       break;
1441     case Op_VectorLoadShuffle:
1442     case Op_VectorRearrange:
1443     case Op_MulReductionVI:
1444       if (UseSSE < 4) { // requires at least SSE4
1445         return false;
1446       }
1447       break;
1448     case Op_SqrtVD:
1449     case Op_SqrtVF:
1450     case Op_VectorMaskCmp:
1451     case Op_VectorCastB2X:
1452     case Op_VectorCastS2X:
1453     case Op_VectorCastI2X:
1454     case Op_VectorCastL2X:
1455     case Op_VectorCastF2X:
1456     case Op_VectorCastD2X:
1457       if (UseAVX < 1) { // enabled for AVX only
1458         return false;
1459       }
1460       break;
1461     case Op_CompareAndSwapL:
1462 #ifdef _LP64
1463     case Op_CompareAndSwapP:
1464 #endif
1465       if (!VM_Version::supports_cx8()) {
1466         return false;
1467       }
1468       break;
1469     case Op_CMoveVF:
1470     case Op_CMoveVD:
1471       if (UseAVX < 1) { // enabled for AVX only
1472         return false;
1473       }
1474       break;
1475     case Op_StrIndexOf:
1476       if (!UseSSE42Intrinsics) {
1477         return false;
1478       }
1479       break;
1480     case Op_StrIndexOfChar:
1481       if (!UseSSE42Intrinsics) {
1482         return false;
1483       }
1484       break;
1485     case Op_OnSpinWait:
1486       if (VM_Version::supports_on_spin_wait() == false) {
1487         return false;
1488       }
1489       break;
1490     case Op_MulVB:
1491     case Op_LShiftVB:
1492     case Op_RShiftVB:
1493     case Op_URShiftVB:
1494     case Op_VectorInsert:
1495     case Op_VectorLoadMask:
1496     case Op_VectorStoreMask:
1497     case Op_VectorBlend:
1498       if (UseSSE < 4) {
1499         return false;
1500       }
1501       break;
1502 #ifdef _LP64
1503     case Op_MaxD:
1504     case Op_MaxF:
1505     case Op_MinD:
1506     case Op_MinF:
1507       if (UseAVX < 1) { // enabled for AVX only
1508         return false;
1509       }
1510       break;
1511 #endif
1512     case Op_CacheWB:
1513     case Op_CacheWBPreSync:
1514     case Op_CacheWBPostSync:
1515       if (!VM_Version::supports_data_cache_line_flush()) {
1516         return false;
1517       }
1518       break;
1519     case Op_ExtractB:
1520     case Op_ExtractL:
1521     case Op_ExtractI:
1522     case Op_RoundDoubleMode:
1523       if (UseSSE < 4) {
1524         return false;
1525       }
1526       break;
1527     case Op_RoundDoubleModeV:
1528       if (VM_Version::supports_avx() == false) {
1529         return false; // 128bit vroundpd is not available
1530       }
1531       break;
1532     case Op_LoadVectorGather:
1533       if (UseAVX < 2) {
1534         return false;
1535       }
1536       break;
1537     case Op_FmaVD:
1538     case Op_FmaVF:
1539       if (!UseFMA) {
1540         return false;
1541       }
1542       break;
1543     case Op_MacroLogicV:
1544       if (UseAVX < 3 || !UseVectorMacroLogic) {
1545         return false;
1546       }
1547       break;
1548 
1549     case Op_VectorCmpMasked:
1550     case Op_VectorMaskGen:
1551     case Op_LoadVectorMasked:
1552     case Op_StoreVectorMasked:
1553       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
1554         return false;
1555       }
1556       break;
1557     case Op_VectorMaskFirstTrue:
1558     case Op_VectorMaskLastTrue:
1559     case Op_VectorMaskTrueCount:
1560     case Op_VectorMaskToLong:
1561       if (!is_LP64 || UseAVX < 1) {
1562          return false;
1563       }
1564       break;
1565     case Op_CopySignD:
1566     case Op_CopySignF:
      if (UseAVX < 3 || !is_LP64) {
1568         return false;
1569       }
1570       if (!VM_Version::supports_avx512vl()) {
1571         return false;
1572       }
1573       break;
1574 #ifndef _LP64
1575     case Op_AddReductionVF:
1576     case Op_AddReductionVD:
1577     case Op_MulReductionVF:
1578     case Op_MulReductionVD:
1579       if (UseSSE < 1) { // requires at least SSE
1580         return false;
1581       }
1582       break;
1583     case Op_MulAddVS2VI:
1584     case Op_RShiftVL:
1585     case Op_AbsVD:
1586     case Op_NegVD:
1587       if (UseSSE < 2) {
1588         return false;
1589       }
1590       break;
1591 #endif // !LP64
1592     case Op_SignumF:
1593       if (UseSSE < 1) {
1594         return false;
1595       }
1596       break;
1597     case Op_SignumD:
1598       if (UseSSE < 2) {
1599         return false;
1600       }
1601       break;
1602   }
1603   return true;  // Match rules are supported by default.
1604 }
1605 
1606 //------------------------------------------------------------------------
1607 
// Identify extra cases where we might want to provide match rules for vector nodes
// and other intrinsics guarded with vector length (vlen) and element type (bt).
1610 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1611   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1612   if (!match_rule_supported(opcode)) {
1613     return false;
1614   }
1615   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1616   //   * SSE2 supports 128bit vectors for all types;
1617   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1618   //   * AVX2 supports 256bit vectors for all types;
1619   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1620   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1621   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1622   // And MaxVectorSize is taken into account as well.
1623   if (!vector_size_supported(bt, vlen)) {
1624     return false;
1625   }
1626   // Special cases which require vector length follow:
1627   //   * implementation limitations
1628   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1629   //   * 128bit vroundpd instruction is present only in AVX1
1630   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1631   switch (opcode) {
1632     case Op_AbsVF:
1633     case Op_NegVF:
1634       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1635         return false; // 512bit vandps and vxorps are not available
1636       }
1637       break;
1638     case Op_AbsVD:
1639     case Op_NegVD:
1640     case Op_MulVL:
1641       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1642         return false; // 512bit vpmullq, vandpd and vxorpd are not available
1643       }
1644       break;
1645     case Op_CMoveVF:
1646       if (vlen != 8) {
1647         return false; // implementation limitation (only vcmov8F_reg is present)
1648       }
1649       break;
1650     case Op_RotateRightV:
1651     case Op_RotateLeftV:
1652       if (bt != T_INT && bt != T_LONG) {
1653         return false;
1654       } // fallthrough
1655     case Op_MacroLogicV:
1656       if (!VM_Version::supports_evex() ||
1657           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
1658         return false;
1659       }
1660       break;
1661     case Op_ClearArray:
1662     case Op_VectorMaskGen:
1663     case Op_VectorCmpMasked:
1664     case Op_LoadVectorMasked:
1665     case Op_StoreVectorMasked:
1666       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
1667         return false;
1668       }
1669       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
1670         return false;
1671       }
1672       break;
1673     case Op_CMoveVD:
1674       if (vlen != 4) {
1675         return false; // implementation limitation (only vcmov4D_reg is present)
1676       }
1677       break;
1678     case Op_MaxV:
1679     case Op_MinV:
1680       if (UseSSE < 4 && is_integral_type(bt)) {
1681         return false;
1682       }
      if (bt == T_FLOAT || bt == T_DOUBLE) {
        // Float/Double intrinsics are enabled for AVX family currently.
        if (UseAVX == 0) {
          return false;
        }
        if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512bit Float/Double intrinsics need AVX512DQ
          return false;
        }
      }
1692       break;
1693     case Op_CallLeafVector:
1694       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
1695         return false;
1696       }
1697       break;
1698     case Op_AddReductionVI:
1699       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
1700         return false;
1701       }
1702       // fallthrough
1703     case Op_AndReductionV:
1704     case Op_OrReductionV:
1705     case Op_XorReductionV:
1706       if (is_subword_type(bt) && (UseSSE < 4)) {
1707         return false;
1708       }
1709 #ifndef _LP64
1710       if (bt == T_BYTE || bt == T_LONG) {
1711         return false;
1712       }
1713 #endif
1714       break;
1715 #ifndef _LP64
1716     case Op_VectorInsert:
1717       if (bt == T_LONG || bt == T_DOUBLE) {
1718         return false;
1719       }
1720       break;
1721 #endif
1722     case Op_MinReductionV:
1723     case Op_MaxReductionV:
1724       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
1725         return false;
1726       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
1727         return false;
1728       }
1729       // Float/Double intrinsics enabled for AVX family.
1730       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
1731         return false;
1732       }
1733       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
1734         return false;
1735       }
1736 #ifndef _LP64
1737       if (bt == T_BYTE || bt == T_LONG) {
1738         return false;
1739       }
1740 #endif
1741       break;
1742     case Op_VectorTest:
1743       if (UseSSE < 4) {
1744         return false; // Implementation limitation
1745       } else if (size_in_bits < 32) {
1746         return false; // Implementation limitation
1747       } else if (size_in_bits == 512 && (VM_Version::supports_avx512bw() == false)) {
1748         return false; // Implementation limitation
1749       }
1750       break;
1751     case Op_VectorLoadShuffle:
1752     case Op_VectorRearrange:
      if (vlen == 2) {
1754         return false; // Implementation limitation due to how shuffle is loaded
1755       } else if (size_in_bits == 256 && UseAVX < 2) {
1756         return false; // Implementation limitation
1757       } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi())  {
1758         return false; // Implementation limitation
1759       } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw())  {
1760         return false; // Implementation limitation
1761       }
1762       break;
1763     case Op_VectorLoadMask:
1764       if (size_in_bits == 256 && UseAVX < 2) {
1765         return false; // Implementation limitation
1766       }
1767       // fallthrough
1768     case Op_VectorStoreMask:
1769       if (vlen == 2) {
1770         return false; // Implementation limitation
1771       }
1772       break;
1773     case Op_VectorCastB2X:
1774       if (size_in_bits == 256 && UseAVX < 2) {
1775         return false; // Implementation limitation
1776       }
1777       break;
1778     case Op_VectorCastS2X:
1779       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1780         return false;
1781       }
1782       break;
1783     case Op_VectorCastI2X:
1784       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1785         return false;
1786       }
1787       break;
1788     case Op_VectorCastL2X:
1789       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1790         return false;
1791       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
1792         return false;
1793       }
1794       break;
1795     case Op_VectorCastF2X:
1796     case Op_VectorCastD2X:
1797       if (is_integral_type(bt)) {
1798         // Casts from FP to integral types require special fixup logic not easily
1799         // implementable with vectors.
1800         return false; // Implementation limitation
1801       }
1802     case Op_MulReductionVI:
1803       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1804         return false;
1805       }
1806       break;
1807     case Op_LoadVectorGatherMasked:
1808     case Op_StoreVectorScatterMasked:
1809     case Op_StoreVectorScatter:
      if (is_subword_type(bt)) {
1811         return false;
1812       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1813         return false;
1814       }
1815       // fallthrough
1816     case Op_LoadVectorGather:
      if (size_in_bits == 64) {
1818         return false;
1819       }
1820       break;
1821     case Op_MaskAll:
1822       if (!is_LP64 || !VM_Version::supports_evex()) {
1823         return false;
1824       }
1825       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
1826         return false;
1827       }
1828       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1829         return false;
1830       }
1831       break;
1832     case Op_VectorMaskCmp:
1833       if (vlen < 2 || size_in_bits < 32) {
1834         return false;
1835       }
1836       break;
1837   }
1838   return true;  // Per default match rules are supported.
1839 }
1840 
1841 const bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
  // The ADLC-based match_rule_supported routine checks for the existence of a pattern
  // based on the IR opcode. Most unary/binary/ternary masked operations share the IR
  // nodes of their non-masked counterparts, with the mask edge being the differentiator.
  // This routine therefore does a strict check for the existence of masked operation
  // patterns: it returns false for every opcode except the ones whose masked
  // instruction patterns are defined in this file.
1848   if (!match_rule_supported_vector(opcode, vlen, bt)) {
1849     return false;
1850   }
1851 
1852   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1853   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1854   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
1855     return false;
1856   }
1857   switch(opcode) {
1858     // Unary masked operations
1859     case Op_AbsVB:
1860     case Op_AbsVS:
      if (!VM_Version::supports_avx512bw()) {
1862         return false;  // Implementation limitation
1863       }
1864     case Op_AbsVI:
1865     case Op_AbsVL:
1866       return true;
1867 
1868     // Ternary masked operations
1869     case Op_FmaVF:
1870     case Op_FmaVD:
1871       return true;
1872 
1873     // Binary masked operations
1874     case Op_AddVB:
1875     case Op_AddVS:
1876     case Op_SubVB:
1877     case Op_SubVS:
1878     case Op_MulVS:
1879     case Op_LShiftVS:
1880     case Op_RShiftVS:
1881     case Op_URShiftVS:
1882       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1883       if (!VM_Version::supports_avx512bw()) {
1884         return false;  // Implementation limitation
1885       }
1886       return true;
1887 
1888     case Op_MulVL:
1889       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1890       if (!VM_Version::supports_avx512dq()) {
1891         return false;  // Implementation limitation
1892       }
1893       return true;
1894 
1895     case Op_AndV:
1896     case Op_OrV:
1897     case Op_XorV:
1898     case Op_RotateRightV:
1899     case Op_RotateLeftV:
1900       if (bt != T_INT && bt != T_LONG) {
1901         return false; // Implementation limitation
1902       }
1903       return true;
1904 
1905     case Op_VectorLoadMask:
1906       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1907       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
1908         return false;
1909       }
1910       return true;
1911 
1912     case Op_AddVI:
1913     case Op_AddVL:
1914     case Op_AddVF:
1915     case Op_AddVD:
1916     case Op_SubVI:
1917     case Op_SubVL:
1918     case Op_SubVF:
1919     case Op_SubVD:
1920     case Op_MulVI:
1921     case Op_MulVF:
1922     case Op_MulVD:
1923     case Op_DivVF:
1924     case Op_DivVD:
1925     case Op_SqrtVF:
1926     case Op_SqrtVD:
1927     case Op_LShiftVI:
1928     case Op_LShiftVL:
1929     case Op_RShiftVI:
1930     case Op_RShiftVL:
1931     case Op_URShiftVI:
1932     case Op_URShiftVL:
1933     case Op_LoadVectorMasked:
1934     case Op_StoreVectorMasked:
1935     case Op_LoadVectorGatherMasked:
1936     case Op_StoreVectorScatterMasked:
1937       return true;
1938 
1939     case Op_MaxV:
1940     case Op_MinV:
1941       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
1942         return false; // Implementation limitation
1943       }
1944       if (is_floating_point_type(bt)) {
1945         return false; // Implementation limitation
1946       }
1947       return true;
1948 
1949     case Op_VectorMaskCmp:
1950       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
1951         return false; // Implementation limitation
1952       }
1953       return true;
1954 
1955     case Op_VectorRearrange:
1956       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
1957         return false; // Implementation limitation
1958       }
1959       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
1960         return false; // Implementation limitation
1961       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
1962         return false; // Implementation limitation
1963       }
1964       return true;
1965 
1966     // Binary Logical operations
1967     case Op_AndVMask:
1968     case Op_OrVMask:
1969     case Op_XorVMask:
1970       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
1971         return false; // Implementation limitation
1972       }
1973       return true;
1974 
1975     case Op_MaskAll:
1976       return true;
1977 
1978     default:
1979       return false;
1980   }
1981 }
1982 
1983 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
1984   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
1985   bool legacy = (generic_opnd->opcode() == LEGVEC);
1986   if (!VM_Version::supports_avx512vlbwdq() && // KNL
1987       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
1988     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
1989     return new legVecZOper();
1990   }
1991   if (legacy) {
1992     switch (ideal_reg) {
1993       case Op_VecS: return new legVecSOper();
1994       case Op_VecD: return new legVecDOper();
1995       case Op_VecX: return new legVecXOper();
1996       case Op_VecY: return new legVecYOper();
1997       case Op_VecZ: return new legVecZOper();
1998     }
1999   } else {
2000     switch (ideal_reg) {
2001       case Op_VecS: return new vecSOper();
2002       case Op_VecD: return new vecDOper();
2003       case Op_VecX: return new vecXOper();
2004       case Op_VecY: return new vecYOper();
2005       case Op_VecZ: return new vecZOper();
2006     }
2007   }
2008   ShouldNotReachHere();
2009   return NULL;
2010 }
2011 
2012 bool Matcher::is_reg2reg_move(MachNode* m) {
2013   switch (m->rule()) {
2014     case MoveVec2Leg_rule:
2015     case MoveLeg2Vec_rule:
2016     case MoveF2VL_rule:
2017     case MoveF2LEG_rule:
2018     case MoveVL2F_rule:
2019     case MoveLEG2F_rule:
2020     case MoveD2VL_rule:
2021     case MoveD2LEG_rule:
2022     case MoveVL2D_rule:
2023     case MoveLEG2D_rule:
2024       return true;
2025     default:
2026       return false;
2027   }
2028 }
2029 
2030 bool Matcher::is_generic_vector(MachOper* opnd) {
2031   switch (opnd->opcode()) {
2032     case VEC:
2033     case LEGVEC:
2034       return true;
2035     default:
2036       return false;
2037   }
2038 }
2039 
2040 //------------------------------------------------------------------------
2041 
2042 const RegMask* Matcher::predicate_reg_mask(void) {
2043   return &_VECTMASK_REG_mask;
2044 }
2045 
2046 const TypeVect* Matcher::predicate_reg_type(const Type* elemTy, int length) {
2047   return new TypeVectMask(elemTy, length);
2048 }
2049 
2050 // Max vector size in bytes. 0 if not supported.
2051 const int Matcher::vector_width_in_bytes(BasicType bt) {
2052   assert(is_java_primitive(bt), "only primitive type vectors");
2053   if (UseSSE < 2) return 0;
2054   // SSE2 supports 128bit vectors for all types.
2055   // AVX2 supports 256bit vectors for all types.
  // AVX512/EVEX supports 512bit vectors for all types.
2057   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
2058   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
2059   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
2060     size = (UseAVX > 2) ? 64 : 32;
2061   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
2062     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
2063   // Use flag to limit vector size.
2064   size = MIN2(size,(int)MaxVectorSize);
2065   // Minimum 2 values in vector (or 4 for bytes).
2066   switch (bt) {
2067   case T_DOUBLE:
2068   case T_LONG:
2069     if (size < 16) return 0;
2070     break;
2071   case T_FLOAT:
2072   case T_INT:
2073     if (size < 8) return 0;
2074     break;
2075   case T_BOOLEAN:
2076     if (size < 4) return 0;
2077     break;
2078   case T_CHAR:
2079     if (size < 4) return 0;
2080     break;
2081   case T_BYTE:
2082     if (size < 4) return 0;
2083     break;
2084   case T_SHORT:
2085     if (size < 4) return 0;
2086     break;
2087   default:
2088     ShouldNotReachHere();
2089   }
2090   return size;
2091 }
2092 
2093 // Limits on vector size (number of elements) loaded into vector.
2094 const int Matcher::max_vector_size(const BasicType bt) {
2095   return vector_width_in_bytes(bt)/type2aelembytes(bt);
2096 }
2097 const int Matcher::min_vector_size(const BasicType bt) {
2098   int max_size = max_vector_size(bt);
2099   // Min size which can be loaded into vector is 4 bytes.
2100   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
2101   // Support for calling svml double64 vectors
2102   if (bt == T_DOUBLE) {
2103     size = 1;
2104   }
2105   return MIN2(size,max_size);
2106 }
2107 
2108 const int Matcher::scalable_vector_reg_size(const BasicType bt) {
2109   return -1;
2110 }
2111 
2112 // Vector ideal reg corresponding to specified size in bytes
2113 const uint Matcher::vector_ideal_reg(int size) {
2114   assert(MaxVectorSize >= size, "");
2115   switch(size) {
2116     case  4: return Op_VecS;
2117     case  8: return Op_VecD;
2118     case 16: return Op_VecX;
2119     case 32: return Op_VecY;
2120     case 64: return Op_VecZ;
2121   }
2122   ShouldNotReachHere();
2123   return 0;
2124 }
2125 
2126 // Check for shift by small constant as well
2127 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
2128   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
2129       shift->in(2)->get_int() <= 3 &&
2130       // Are there other uses besides address expressions?
2131       !matcher->is_visited(shift)) {
2132     address_visited.set(shift->_idx); // Flag as address_visited
2133     mstack.push(shift->in(2), Matcher::Visit);
2134     Node *conv = shift->in(1);
2135 #ifdef _LP64
2136     // Allow Matcher to match the rule which bypass
2137     // ConvI2L operation for an array index on LP64
2138     // if the index value is positive.
2139     if (conv->Opcode() == Op_ConvI2L &&
2140         conv->as_Type()->type()->is_long()->_lo >= 0 &&
2141         // Are there other uses besides address expressions?
2142         !matcher->is_visited(conv)) {
2143       address_visited.set(conv->_idx); // Flag as address_visited
2144       mstack.push(conv->in(1), Matcher::Pre_Visit);
2145     } else
2146 #endif
2147       mstack.push(conv, Matcher::Pre_Visit);
2148     return true;
2149   }
2150   return false;
2151 }
2152 
2153 // This function identifies sub-graphs in which a 'load' node is
2154 // input to two different nodes, and such that it can be matched
2155 // with BMI instructions like blsi, blsr, etc.
// Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
2157 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
2158 // refers to the same node.
2159 //
2160 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
2161 // This is a temporary solution until we make DAGs expressible in ADL.
2162 template<typename ConType>
2163 class FusedPatternMatcher {
2164   Node* _op1_node;
2165   Node* _mop_node;
2166   int _con_op;
2167 
2168   static int match_next(Node* n, int next_op, int next_op_idx) {
2169     if (n->in(1) == NULL || n->in(2) == NULL) {
2170       return -1;
2171     }
2172 
2173     if (next_op_idx == -1) { // n is commutative, try rotations
2174       if (n->in(1)->Opcode() == next_op) {
2175         return 1;
2176       } else if (n->in(2)->Opcode() == next_op) {
2177         return 2;
2178       }
2179     } else {
2180       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
2181       if (n->in(next_op_idx)->Opcode() == next_op) {
2182         return next_op_idx;
2183       }
2184     }
2185     return -1;
2186   }
2187 
2188  public:
2189   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
2190     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
2191 
2192   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
2193              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
2194              typename ConType::NativeType con_value) {
2195     if (_op1_node->Opcode() != op1) {
2196       return false;
2197     }
2198     if (_mop_node->outcnt() > 2) {
2199       return false;
2200     }
2201     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
2202     if (op1_op2_idx == -1) {
2203       return false;
2204     }
2205     // Memory operation must be the other edge
2206     int op1_mop_idx = (op1_op2_idx & 1) + 1;
2207 
2208     // Check that the mop node is really what we want
2209     if (_op1_node->in(op1_mop_idx) == _mop_node) {
2210       Node* op2_node = _op1_node->in(op1_op2_idx);
2211       if (op2_node->outcnt() > 1) {
2212         return false;
2213       }
2214       assert(op2_node->Opcode() == op2, "Should be");
2215       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
2216       if (op2_con_idx == -1) {
2217         return false;
2218       }
2219       // Memory operation must be the other edge
2220       int op2_mop_idx = (op2_con_idx & 1) + 1;
2221       // Check that the memory operation is the same node
2222       if (op2_node->in(op2_mop_idx) == _mop_node) {
2223         // Now check the constant
2224         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
2225         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
2226           return true;
2227         }
2228       }
2229     }
2230     return false;
2231   }
2232 };
2233 
2234 static bool is_bmi_pattern(Node* n, Node* m) {
2235   assert(UseBMI1Instructions, "sanity");
2236   if (n != NULL && m != NULL) {
2237     if (m->Opcode() == Op_LoadI) {
2238       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
2239       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
2240              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
2241              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
2242     } else if (m->Opcode() == Op_LoadL) {
2243       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
2244       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
2245              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
2246              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
2247     }
2248   }
2249   return false;
2250 }
2251 
2252 // Should the matcher clone input 'm' of node 'n'?
2253 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
2254   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
2255   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
2256     mstack.push(m, Visit);
2257     return true;
2258   }
2259   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
2260     mstack.push(m, Visit);           // m = ShiftCntV
2261     return true;
2262   }
2263   return false;
2264 }
2265 
2266 // Should the Matcher clone shifts on addressing modes, expecting them
2267 // to be subsumed into complex addressing expressions or compute them
2268 // into registers?
2269 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
2270   Node *off = m->in(AddPNode::Offset);
2271   if (off->is_Con()) {
2272     address_visited.test_set(m->_idx); // Flag as address_visited
2273     Node *adr = m->in(AddPNode::Address);
2274 
2275     // Intel can handle 2 adds in addressing mode
2276     // AtomicAdd is not an addressing expression.
2277     // Cheap to find it by looking for screwy base.
2278     if (adr->is_AddP() &&
2279         !adr->in(AddPNode::Base)->is_top() &&
2280         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
2281         // Are there other uses besides address expressions?
2282         !is_visited(adr)) {
2283       address_visited.set(adr->_idx); // Flag as address_visited
2284       Node *shift = adr->in(AddPNode::Offset);
2285       if (!clone_shift(shift, this, mstack, address_visited)) {
2286         mstack.push(shift, Pre_Visit);
2287       }
2288       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
2289       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
2290     } else {
2291       mstack.push(adr, Pre_Visit);
2292     }
2293 
2294     // Clone X+offset as it also folds into most addressing expressions
2295     mstack.push(off, Visit);
2296     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2297     return true;
2298   } else if (clone_shift(off, this, mstack, address_visited)) {
2299     address_visited.test_set(m->_idx); // Flag as address_visited
2300     mstack.push(m->in(AddPNode::Address), Pre_Visit);
2301     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2302     return true;
2303   }
2304   return false;
2305 }
2306 
2307 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
2308   switch (bt) {
2309     case BoolTest::eq:
2310       return Assembler::eq;
2311     case BoolTest::ne:
2312       return Assembler::neq;
2313     case BoolTest::le:
2314     case BoolTest::ule:
2315       return Assembler::le;
2316     case BoolTest::ge:
2317     case BoolTest::uge:
2318       return Assembler::nlt;
2319     case BoolTest::lt:
2320     case BoolTest::ult:
2321       return Assembler::lt;
2322     case BoolTest::gt:
2323     case BoolTest::ugt:
2324       return Assembler::nle;
2325     default: ShouldNotReachHere(); return Assembler::_false;
2326   }
2327 }
2328 
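     // Informal note on the predicate choice below: the _OQ/_UQ suffixes select
     // ordered/unordered, quiet (non-signaling) compares. An ordered compare is
     // false when either operand is NaN, which matches Java semantics for ==, <,
     // <=, > and >=, while != must use an unordered compare so that NaN != NaN
     // evaluates to true.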
2329 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
2330   switch (bt) {
2331   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
2332   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
2333   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
2334   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
2335   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
2336   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
2337   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
2338   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
2339   }
2340 }
2341 
2342 // Helper methods for MachSpillCopyNode::implementation().
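     // Background (general EVEX encoding constraint, stated informally): without
     // AVX512VL only 512-bit EVEX instructions can reach the extended registers
     // xmm16-xmm31, so the 128/256-bit copies and spills below fall back to
     // vextractf32x4/vextractf64x4 and vinsertf32x4/vinsertf64x4 with index 0
     // instead of plain movdqu/vmovdqu.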
2343 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
2344                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
2345   assert(ireg == Op_VecS || // 32bit vector
2346          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
2347           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
2348          "no non-adjacent vector moves");
2349   if (cbuf) {
2350     C2_MacroAssembler _masm(cbuf);
2351     switch (ireg) {
2352     case Op_VecS: // copy whole register
2353     case Op_VecD:
2354     case Op_VecX:
2355 #ifndef _LP64
2356       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2357 #else
2358       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2359         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2360       } else {
2361         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2362       }
2363 #endif
2364       break;
2365     case Op_VecY:
2366 #ifndef _LP64
2367       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2368 #else
2369       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2370         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2371       } else {
2372         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2373       }
2374 #endif
2375       break;
2376     case Op_VecZ:
2377       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
2378       break;
2379     default:
2380       ShouldNotReachHere();
2381     }
2382 #ifndef PRODUCT
2383   } else {
2384     switch (ireg) {
2385     case Op_VecS:
2386     case Op_VecD:
2387     case Op_VecX:
2388       st->print("movdqu  %s,%s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
2389       break;
2390     case Op_VecY:
2391     case Op_VecZ:
2392       st->print("vmovdqu %s,%s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
2393       break;
2394     default:
2395       ShouldNotReachHere();
2396     }
2397 #endif
2398   }
2399 }
2400 
2401 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
2402                      int stack_offset, int reg, uint ireg, outputStream* st) {
2403   if (cbuf) {
2404     C2_MacroAssembler _masm(cbuf);
2405     if (is_load) {
2406       switch (ireg) {
2407       case Op_VecS:
2408         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2409         break;
2410       case Op_VecD:
2411         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2412         break;
2413       case Op_VecX:
2414 #ifndef _LP64
2415         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2416 #else
2417         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2418           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2419         } else {
2420           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2421           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2422         }
2423 #endif
2424         break;
2425       case Op_VecY:
2426 #ifndef _LP64
2427         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2428 #else
2429         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2430           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2431         } else {
2432           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2433           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2434         }
2435 #endif
2436         break;
2437       case Op_VecZ:
2438         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
2439         break;
2440       default:
2441         ShouldNotReachHere();
2442       }
2443     } else { // store
2444       switch (ireg) {
2445       case Op_VecS:
2446         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2447         break;
2448       case Op_VecD:
2449         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2450         break;
2451       case Op_VecX:
2452 #ifndef _LP64
2453         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2454 #else
2455         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2456           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2457         }
2458         else {
2459           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2460         }
2461 #endif
2462         break;
2463       case Op_VecY:
2464 #ifndef _LP64
2465         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2466 #else
2467         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2468           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2469         }
2470         else {
2471           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2472         }
2473 #endif
2474         break;
2475       case Op_VecZ:
2476         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2477         break;
2478       default:
2479         ShouldNotReachHere();
2480       }
2481     }
2482 #ifndef PRODUCT
2483   } else {
2484     if (is_load) {
2485       switch (ireg) {
2486       case Op_VecS:
2487         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2488         break;
2489       case Op_VecD:
2490         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2491         break;
2492        case Op_VecX:
2493         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2494         break;
2495       case Op_VecY:
2496       case Op_VecZ:
2497         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2498         break;
2499       default:
2500         ShouldNotReachHere();
2501       }
2502     } else { // store
2503       switch (ireg) {
2504       case Op_VecS:
2505         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2506         break;
2507       case Op_VecD:
2508         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2509         break;
2510        case Op_VecX:
2511         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2512         break;
2513       case Op_VecY:
2514       case Op_VecZ:
2515         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2516         break;
2517       default:
2518         ShouldNotReachHere();
2519       }
2520     }
2521 #endif
2522   }
2523 }
2524 
2525 static inline jlong replicate8_imm(int con, int width) {
2526   // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
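       // For example (illustrative values): replicate8_imm(0x2A, 1) yields
       // 0x2A2A2A2A2A2A2A2A and replicate8_imm(-1, 2) yields 0xFFFFFFFFFFFFFFFF.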
2527   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
2528   int bit_width = width * 8;
2529   jlong val = con;
2530   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
2531   while(bit_width < 64) {
2532     val |= (val << bit_width);
2533     bit_width <<= 1;
2534   }
2535   return val;
2536 }
2537 
2538 #ifndef PRODUCT
2539   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2540     st->print("nop \t# %d bytes pad for loops and calls", _count);
2541   }
2542 #endif
2543 
2544   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
2545     C2_MacroAssembler _masm(&cbuf);
2546     __ nop(_count);
2547   }
2548 
2549   uint MachNopNode::size(PhaseRegAlloc*) const {
2550     return _count;
2551   }
2552 
2553 #ifndef PRODUCT
2554   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2555     st->print("# breakpoint");
2556   }
2557 #endif
2558 
2559   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2560     C2_MacroAssembler _masm(&cbuf);
2561     __ int3();
2562   }
2563 
2564   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2565     return MachNode::size(ra_);
2566   }
2567 
2568 %}
2569 
2570 encode %{
2571 
2572   enc_class call_epilog %{

2573     if (VerifyStackAtCalls) {
2574       // Check that stack depth is unchanged: find the magic cookie on the stack
2575       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2576       C2_MacroAssembler _masm(&cbuf);
2577       Label L;
2578       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2579       __ jccb(Assembler::equal, L);
2580       // Die if stack mismatch
2581       __ int3();
2582       __ bind(L);
2583     }

2584   %}
2585 
2586 %}
2587 
2588 // Operands for bound floating pointer register arguments
2589 operand rxmm0() %{
2590   constraint(ALLOC_IN_RC(xmm0_reg));
2591   match(VecX);
2592   format %{ %}
2593   interface(REG_INTER);
2594 %}
2595 
2596 //----------OPERANDS-----------------------------------------------------------
2597 // Operand definitions must precede instruction definitions for correct parsing
2598 // in the ADLC because operands constitute user defined types which are used in
2599 // instruction definitions.
2600 
2601 // Vectors
2602 
2603 // Dummy generic vector class. Should be used for all vector operands.
2604 // Replaced with vec[SDXYZ] during post-selection pass.
2605 operand vec() %{
2606   constraint(ALLOC_IN_RC(dynamic));
2607   match(VecX);
2608   match(VecY);
2609   match(VecZ);
2610   match(VecS);
2611   match(VecD);
2612 
2613   format %{ %}
2614   interface(REG_INTER);
2615 %}
2616 
2617 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
2618 // Replaced with legVec[SDXYZ] during post-selection cleanup.
2619 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
2620 // runtime code generation via reg_class_dynamic.
2621 operand legVec() %{
2622   constraint(ALLOC_IN_RC(dynamic));
2623   match(VecX);
2624   match(VecY);
2625   match(VecZ);
2626   match(VecS);
2627   match(VecD);
2628 
2629   format %{ %}
2630   interface(REG_INTER);
2631 %}
2632 
2633 // Replaces vec during post-selection cleanup. See above.
2634 operand vecS() %{
2635   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
2636   match(VecS);
2637 
2638   format %{ %}
2639   interface(REG_INTER);
2640 %}
2641 
2642 // Replaces legVec during post-selection cleanup. See above.
2643 operand legVecS() %{
2644   constraint(ALLOC_IN_RC(vectors_reg_legacy));
2645   match(VecS);
2646 
2647   format %{ %}
2648   interface(REG_INTER);
2649 %}
2650 
2651 // Replaces vec during post-selection cleanup. See above.
2652 operand vecD() %{
2653   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
2654   match(VecD);
2655 
2656   format %{ %}
2657   interface(REG_INTER);
2658 %}
2659 
2660 // Replaces legVec during post-selection cleanup. See above.
2661 operand legVecD() %{
2662   constraint(ALLOC_IN_RC(vectord_reg_legacy));
2663   match(VecD);
2664 
2665   format %{ %}
2666   interface(REG_INTER);
2667 %}
2668 
2669 // Replaces vec during post-selection cleanup. See above.
2670 operand vecX() %{
2671   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
2672   match(VecX);
2673 
2674   format %{ %}
2675   interface(REG_INTER);
2676 %}
2677 
2678 // Replaces legVec during post-selection cleanup. See above.
2679 operand legVecX() %{
2680   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
2681   match(VecX);
2682 
2683   format %{ %}
2684   interface(REG_INTER);
2685 %}
2686 
2687 // Replaces vec during post-selection cleanup. See above.
2688 operand vecY() %{
2689   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
2690   match(VecY);
2691 
2692   format %{ %}
2693   interface(REG_INTER);
2694 %}
2695 
2696 // Replaces legVec during post-selection cleanup. See above.
2697 operand legVecY() %{
2698   constraint(ALLOC_IN_RC(vectory_reg_legacy));
2699   match(VecY);
2700 
2701   format %{ %}
2702   interface(REG_INTER);
2703 %}
2704 
2705 // Replaces vec during post-selection cleanup. See above.
2706 operand vecZ() %{
2707   constraint(ALLOC_IN_RC(vectorz_reg));
2708   match(VecZ);
2709 
2710   format %{ %}
2711   interface(REG_INTER);
2712 %}
2713 
2714 // Replaces legVec during post-selection cleanup. See above.
2715 operand legVecZ() %{
2716   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
2717   match(VecZ);
2718 
2719   format %{ %}
2720   interface(REG_INTER);
2721 %}
2722 
2723 // Comparison Code for FP conditional move
2724 operand cmpOp_vcmppd() %{
2725   match(Bool);
2726 
2727   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2728             n->as_Bool()->_test._test != BoolTest::no_overflow);
2729   format %{ "" %}
2730   interface(COND_INTER) %{
2731     equal        (0x0, "eq");
2732     less         (0x1, "lt");
2733     less_equal   (0x2, "le");
2734     not_equal    (0xC, "ne");
2735     greater_equal(0xD, "ge");
2736     greater      (0xE, "gt");
2737     // TODO: cannot compile (adlc breaks) without the next two lines; the error is:
2738     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2739     // equal' for overflow.
2740     overflow     (0x20, "o");  // not really supported by the instruction
2741     no_overflow  (0x21, "no"); // not really supported by the instruction
2742   %}
2743 %}
2744 
2745 
2746 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2747 
2748 // ============================================================================
2749 
2750 instruct ShouldNotReachHere() %{
2751   match(Halt);
2752   format %{ "stop\t# ShouldNotReachHere" %}
2753   ins_encode %{
2754     if (is_reachable()) {
2755       __ stop(_halt_reason);
2756     }
2757   %}
2758   ins_pipe(pipe_slow);
2759 %}
2760 
2761 // =================================EVEX special===============================
2762 // Existing partial implementation for post-loop multi-versioning computes
2763 // the mask corresponding to the tail loop in the K1 opmask register. This may then be
2764 // used for predicating instructions in the loop body during the last post-loop iteration.
2765 // TODO: Remove hard-coded K1 usage while fixing existing post-loop
2766 // multiversioning support.
2767 instruct setMask(rRegI dst, rRegI src, kReg_K1 mask) %{
2768   predicate(PostLoopMultiversioning && Matcher::has_predicated_vectors());
2769   match(Set dst (SetVectMaskI  src));
2770   effect(TEMP dst);
2771   format %{ "setvectmask   $dst, $src" %}
2772   ins_encode %{
2773     __ setvectmask($dst$$Register, $src$$Register, $mask$$KRegister);
2774   %}
2775   ins_pipe(pipe_slow);
2776 %}
2777 
2778 // ============================================================================
2779 
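     // Informal note on the scalar FP rules that follow: the legacy SSE encodings
     // are destructive two-operand forms (dst is also the first source), while the
     // AVX/VEX encodings are non-destructive three-operand forms, e.g.
     // (illustrative assembly only):
     //   addss  xmm0, xmm1         ; SSE: xmm0 = xmm0 + xmm1
     //   vaddss xmm0, xmm1, xmm2   ; AVX: xmm0 = xmm1 + xmm2
     // This is why the SSE rules match "dst (op dst src)" shapes while the AVX
     // rules take independent src1/src2 operands.
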
2780 instruct addF_reg(regF dst, regF src) %{
2781   predicate((UseSSE>=1) && (UseAVX == 0));
2782   match(Set dst (AddF dst src));
2783 
2784   format %{ "addss   $dst, $src" %}
2785   ins_cost(150);
2786   ins_encode %{
2787     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2788   %}
2789   ins_pipe(pipe_slow);
2790 %}
2791 
2792 instruct addF_mem(regF dst, memory src) %{
2793   predicate((UseSSE>=1) && (UseAVX == 0));
2794   match(Set dst (AddF dst (LoadF src)));
2795 
2796   format %{ "addss   $dst, $src" %}
2797   ins_cost(150);
2798   ins_encode %{
2799     __ addss($dst$$XMMRegister, $src$$Address);
2800   %}
2801   ins_pipe(pipe_slow);
2802 %}
2803 
2804 instruct addF_imm(regF dst, immF con) %{
2805   predicate((UseSSE>=1) && (UseAVX == 0));
2806   match(Set dst (AddF dst con));
2807   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2808   ins_cost(150);
2809   ins_encode %{
2810     __ addss($dst$$XMMRegister, $constantaddress($con));
2811   %}
2812   ins_pipe(pipe_slow);
2813 %}
2814 
2815 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2816   predicate(UseAVX > 0);
2817   match(Set dst (AddF src1 src2));
2818 
2819   format %{ "vaddss  $dst, $src1, $src2" %}
2820   ins_cost(150);
2821   ins_encode %{
2822     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2823   %}
2824   ins_pipe(pipe_slow);
2825 %}
2826 
2827 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2828   predicate(UseAVX > 0);
2829   match(Set dst (AddF src1 (LoadF src2)));
2830 
2831   format %{ "vaddss  $dst, $src1, $src2" %}
2832   ins_cost(150);
2833   ins_encode %{
2834     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2835   %}
2836   ins_pipe(pipe_slow);
2837 %}
2838 
2839 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2840   predicate(UseAVX > 0);
2841   match(Set dst (AddF src con));
2842 
2843   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2844   ins_cost(150);
2845   ins_encode %{
2846     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2847   %}
2848   ins_pipe(pipe_slow);
2849 %}
2850 
2851 instruct addD_reg(regD dst, regD src) %{
2852   predicate((UseSSE>=2) && (UseAVX == 0));
2853   match(Set dst (AddD dst src));
2854 
2855   format %{ "addsd   $dst, $src" %}
2856   ins_cost(150);
2857   ins_encode %{
2858     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2859   %}
2860   ins_pipe(pipe_slow);
2861 %}
2862 
2863 instruct addD_mem(regD dst, memory src) %{
2864   predicate((UseSSE>=2) && (UseAVX == 0));
2865   match(Set dst (AddD dst (LoadD src)));
2866 
2867   format %{ "addsd   $dst, $src" %}
2868   ins_cost(150);
2869   ins_encode %{
2870     __ addsd($dst$$XMMRegister, $src$$Address);
2871   %}
2872   ins_pipe(pipe_slow);
2873 %}
2874 
2875 instruct addD_imm(regD dst, immD con) %{
2876   predicate((UseSSE>=2) && (UseAVX == 0));
2877   match(Set dst (AddD dst con));
2878   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2879   ins_cost(150);
2880   ins_encode %{
2881     __ addsd($dst$$XMMRegister, $constantaddress($con));
2882   %}
2883   ins_pipe(pipe_slow);
2884 %}
2885 
2886 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2887   predicate(UseAVX > 0);
2888   match(Set dst (AddD src1 src2));
2889 
2890   format %{ "vaddsd  $dst, $src1, $src2" %}
2891   ins_cost(150);
2892   ins_encode %{
2893     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2894   %}
2895   ins_pipe(pipe_slow);
2896 %}
2897 
2898 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2899   predicate(UseAVX > 0);
2900   match(Set dst (AddD src1 (LoadD src2)));
2901 
2902   format %{ "vaddsd  $dst, $src1, $src2" %}
2903   ins_cost(150);
2904   ins_encode %{
2905     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2906   %}
2907   ins_pipe(pipe_slow);
2908 %}
2909 
2910 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2911   predicate(UseAVX > 0);
2912   match(Set dst (AddD src con));
2913 
2914   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2915   ins_cost(150);
2916   ins_encode %{
2917     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2918   %}
2919   ins_pipe(pipe_slow);
2920 %}
2921 
2922 instruct subF_reg(regF dst, regF src) %{
2923   predicate((UseSSE>=1) && (UseAVX == 0));
2924   match(Set dst (SubF dst src));
2925 
2926   format %{ "subss   $dst, $src" %}
2927   ins_cost(150);
2928   ins_encode %{
2929     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2930   %}
2931   ins_pipe(pipe_slow);
2932 %}
2933 
2934 instruct subF_mem(regF dst, memory src) %{
2935   predicate((UseSSE>=1) && (UseAVX == 0));
2936   match(Set dst (SubF dst (LoadF src)));
2937 
2938   format %{ "subss   $dst, $src" %}
2939   ins_cost(150);
2940   ins_encode %{
2941     __ subss($dst$$XMMRegister, $src$$Address);
2942   %}
2943   ins_pipe(pipe_slow);
2944 %}
2945 
2946 instruct subF_imm(regF dst, immF con) %{
2947   predicate((UseSSE>=1) && (UseAVX == 0));
2948   match(Set dst (SubF dst con));
2949   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2950   ins_cost(150);
2951   ins_encode %{
2952     __ subss($dst$$XMMRegister, $constantaddress($con));
2953   %}
2954   ins_pipe(pipe_slow);
2955 %}
2956 
2957 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2958   predicate(UseAVX > 0);
2959   match(Set dst (SubF src1 src2));
2960 
2961   format %{ "vsubss  $dst, $src1, $src2" %}
2962   ins_cost(150);
2963   ins_encode %{
2964     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2965   %}
2966   ins_pipe(pipe_slow);
2967 %}
2968 
2969 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2970   predicate(UseAVX > 0);
2971   match(Set dst (SubF src1 (LoadF src2)));
2972 
2973   format %{ "vsubss  $dst, $src1, $src2" %}
2974   ins_cost(150);
2975   ins_encode %{
2976     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2977   %}
2978   ins_pipe(pipe_slow);
2979 %}
2980 
2981 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2982   predicate(UseAVX > 0);
2983   match(Set dst (SubF src con));
2984 
2985   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2986   ins_cost(150);
2987   ins_encode %{
2988     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2989   %}
2990   ins_pipe(pipe_slow);
2991 %}
2992 
2993 instruct subD_reg(regD dst, regD src) %{
2994   predicate((UseSSE>=2) && (UseAVX == 0));
2995   match(Set dst (SubD dst src));
2996 
2997   format %{ "subsd   $dst, $src" %}
2998   ins_cost(150);
2999   ins_encode %{
3000     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
3001   %}
3002   ins_pipe(pipe_slow);
3003 %}
3004 
3005 instruct subD_mem(regD dst, memory src) %{
3006   predicate((UseSSE>=2) && (UseAVX == 0));
3007   match(Set dst (SubD dst (LoadD src)));
3008 
3009   format %{ "subsd   $dst, $src" %}
3010   ins_cost(150);
3011   ins_encode %{
3012     __ subsd($dst$$XMMRegister, $src$$Address);
3013   %}
3014   ins_pipe(pipe_slow);
3015 %}
3016 
3017 instruct subD_imm(regD dst, immD con) %{
3018   predicate((UseSSE>=2) && (UseAVX == 0));
3019   match(Set dst (SubD dst con));
3020   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3021   ins_cost(150);
3022   ins_encode %{
3023     __ subsd($dst$$XMMRegister, $constantaddress($con));
3024   %}
3025   ins_pipe(pipe_slow);
3026 %}
3027 
3028 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
3029   predicate(UseAVX > 0);
3030   match(Set dst (SubD src1 src2));
3031 
3032   format %{ "vsubsd  $dst, $src1, $src2" %}
3033   ins_cost(150);
3034   ins_encode %{
3035     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3036   %}
3037   ins_pipe(pipe_slow);
3038 %}
3039 
3040 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
3041   predicate(UseAVX > 0);
3042   match(Set dst (SubD src1 (LoadD src2)));
3043 
3044   format %{ "vsubsd  $dst, $src1, $src2" %}
3045   ins_cost(150);
3046   ins_encode %{
3047     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3048   %}
3049   ins_pipe(pipe_slow);
3050 %}
3051 
3052 instruct subD_reg_imm(regD dst, regD src, immD con) %{
3053   predicate(UseAVX > 0);
3054   match(Set dst (SubD src con));
3055 
3056   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3057   ins_cost(150);
3058   ins_encode %{
3059     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3060   %}
3061   ins_pipe(pipe_slow);
3062 %}
3063 
3064 instruct mulF_reg(regF dst, regF src) %{
3065   predicate((UseSSE>=1) && (UseAVX == 0));
3066   match(Set dst (MulF dst src));
3067 
3068   format %{ "mulss   $dst, $src" %}
3069   ins_cost(150);
3070   ins_encode %{
3071     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
3072   %}
3073   ins_pipe(pipe_slow);
3074 %}
3075 
3076 instruct mulF_mem(regF dst, memory src) %{
3077   predicate((UseSSE>=1) && (UseAVX == 0));
3078   match(Set dst (MulF dst (LoadF src)));
3079 
3080   format %{ "mulss   $dst, $src" %}
3081   ins_cost(150);
3082   ins_encode %{
3083     __ mulss($dst$$XMMRegister, $src$$Address);
3084   %}
3085   ins_pipe(pipe_slow);
3086 %}
3087 
3088 instruct mulF_imm(regF dst, immF con) %{
3089   predicate((UseSSE>=1) && (UseAVX == 0));
3090   match(Set dst (MulF dst con));
3091   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3092   ins_cost(150);
3093   ins_encode %{
3094     __ mulss($dst$$XMMRegister, $constantaddress($con));
3095   %}
3096   ins_pipe(pipe_slow);
3097 %}
3098 
3099 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
3100   predicate(UseAVX > 0);
3101   match(Set dst (MulF src1 src2));
3102 
3103   format %{ "vmulss  $dst, $src1, $src2" %}
3104   ins_cost(150);
3105   ins_encode %{
3106     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3107   %}
3108   ins_pipe(pipe_slow);
3109 %}
3110 
3111 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
3112   predicate(UseAVX > 0);
3113   match(Set dst (MulF src1 (LoadF src2)));
3114 
3115   format %{ "vmulss  $dst, $src1, $src2" %}
3116   ins_cost(150);
3117   ins_encode %{
3118     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3119   %}
3120   ins_pipe(pipe_slow);
3121 %}
3122 
3123 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
3124   predicate(UseAVX > 0);
3125   match(Set dst (MulF src con));
3126 
3127   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3128   ins_cost(150);
3129   ins_encode %{
3130     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3131   %}
3132   ins_pipe(pipe_slow);
3133 %}
3134 
3135 instruct mulD_reg(regD dst, regD src) %{
3136   predicate((UseSSE>=2) && (UseAVX == 0));
3137   match(Set dst (MulD dst src));
3138 
3139   format %{ "mulsd   $dst, $src" %}
3140   ins_cost(150);
3141   ins_encode %{
3142     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
3143   %}
3144   ins_pipe(pipe_slow);
3145 %}
3146 
3147 instruct mulD_mem(regD dst, memory src) %{
3148   predicate((UseSSE>=2) && (UseAVX == 0));
3149   match(Set dst (MulD dst (LoadD src)));
3150 
3151   format %{ "mulsd   $dst, $src" %}
3152   ins_cost(150);
3153   ins_encode %{
3154     __ mulsd($dst$$XMMRegister, $src$$Address);
3155   %}
3156   ins_pipe(pipe_slow);
3157 %}
3158 
3159 instruct mulD_imm(regD dst, immD con) %{
3160   predicate((UseSSE>=2) && (UseAVX == 0));
3161   match(Set dst (MulD dst con));
3162   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3163   ins_cost(150);
3164   ins_encode %{
3165     __ mulsd($dst$$XMMRegister, $constantaddress($con));
3166   %}
3167   ins_pipe(pipe_slow);
3168 %}
3169 
3170 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
3171   predicate(UseAVX > 0);
3172   match(Set dst (MulD src1 src2));
3173 
3174   format %{ "vmulsd  $dst, $src1, $src2" %}
3175   ins_cost(150);
3176   ins_encode %{
3177     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3178   %}
3179   ins_pipe(pipe_slow);
3180 %}
3181 
3182 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
3183   predicate(UseAVX > 0);
3184   match(Set dst (MulD src1 (LoadD src2)));
3185 
3186   format %{ "vmulsd  $dst, $src1, $src2" %}
3187   ins_cost(150);
3188   ins_encode %{
3189     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3190   %}
3191   ins_pipe(pipe_slow);
3192 %}
3193 
3194 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
3195   predicate(UseAVX > 0);
3196   match(Set dst (MulD src con));
3197 
3198   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3199   ins_cost(150);
3200   ins_encode %{
3201     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3202   %}
3203   ins_pipe(pipe_slow);
3204 %}
3205 
3206 instruct divF_reg(regF dst, regF src) %{
3207   predicate((UseSSE>=1) && (UseAVX == 0));
3208   match(Set dst (DivF dst src));
3209 
3210   format %{ "divss   $dst, $src" %}
3211   ins_cost(150);
3212   ins_encode %{
3213     __ divss($dst$$XMMRegister, $src$$XMMRegister);
3214   %}
3215   ins_pipe(pipe_slow);
3216 %}
3217 
3218 instruct divF_mem(regF dst, memory src) %{
3219   predicate((UseSSE>=1) && (UseAVX == 0));
3220   match(Set dst (DivF dst (LoadF src)));
3221 
3222   format %{ "divss   $dst, $src" %}
3223   ins_cost(150);
3224   ins_encode %{
3225     __ divss($dst$$XMMRegister, $src$$Address);
3226   %}
3227   ins_pipe(pipe_slow);
3228 %}
3229 
3230 instruct divF_imm(regF dst, immF con) %{
3231   predicate((UseSSE>=1) && (UseAVX == 0));
3232   match(Set dst (DivF dst con));
3233   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3234   ins_cost(150);
3235   ins_encode %{
3236     __ divss($dst$$XMMRegister, $constantaddress($con));
3237   %}
3238   ins_pipe(pipe_slow);
3239 %}
3240 
3241 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
3242   predicate(UseAVX > 0);
3243   match(Set dst (DivF src1 src2));
3244 
3245   format %{ "vdivss  $dst, $src1, $src2" %}
3246   ins_cost(150);
3247   ins_encode %{
3248     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3249   %}
3250   ins_pipe(pipe_slow);
3251 %}
3252 
3253 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
3254   predicate(UseAVX > 0);
3255   match(Set dst (DivF src1 (LoadF src2)));
3256 
3257   format %{ "vdivss  $dst, $src1, $src2" %}
3258   ins_cost(150);
3259   ins_encode %{
3260     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3261   %}
3262   ins_pipe(pipe_slow);
3263 %}
3264 
3265 instruct divF_reg_imm(regF dst, regF src, immF con) %{
3266   predicate(UseAVX > 0);
3267   match(Set dst (DivF src con));
3268 
3269   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3270   ins_cost(150);
3271   ins_encode %{
3272     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3273   %}
3274   ins_pipe(pipe_slow);
3275 %}
3276 
3277 instruct divD_reg(regD dst, regD src) %{
3278   predicate((UseSSE>=2) && (UseAVX == 0));
3279   match(Set dst (DivD dst src));
3280 
3281   format %{ "divsd   $dst, $src" %}
3282   ins_cost(150);
3283   ins_encode %{
3284     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
3285   %}
3286   ins_pipe(pipe_slow);
3287 %}
3288 
3289 instruct divD_mem(regD dst, memory src) %{
3290   predicate((UseSSE>=2) && (UseAVX == 0));
3291   match(Set dst (DivD dst (LoadD src)));
3292 
3293   format %{ "divsd   $dst, $src" %}
3294   ins_cost(150);
3295   ins_encode %{
3296     __ divsd($dst$$XMMRegister, $src$$Address);
3297   %}
3298   ins_pipe(pipe_slow);
3299 %}
3300 
3301 instruct divD_imm(regD dst, immD con) %{
3302   predicate((UseSSE>=2) && (UseAVX == 0));
3303   match(Set dst (DivD dst con));
3304   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3305   ins_cost(150);
3306   ins_encode %{
3307     __ divsd($dst$$XMMRegister, $constantaddress($con));
3308   %}
3309   ins_pipe(pipe_slow);
3310 %}
3311 
3312 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
3313   predicate(UseAVX > 0);
3314   match(Set dst (DivD src1 src2));
3315 
3316   format %{ "vdivsd  $dst, $src1, $src2" %}
3317   ins_cost(150);
3318   ins_encode %{
3319     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3320   %}
3321   ins_pipe(pipe_slow);
3322 %}
3323 
3324 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
3325   predicate(UseAVX > 0);
3326   match(Set dst (DivD src1 (LoadD src2)));
3327 
3328   format %{ "vdivsd  $dst, $src1, $src2" %}
3329   ins_cost(150);
3330   ins_encode %{
3331     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3332   %}
3333   ins_pipe(pipe_slow);
3334 %}
3335 
3336 instruct divD_reg_imm(regD dst, regD src, immD con) %{
3337   predicate(UseAVX > 0);
3338   match(Set dst (DivD src con));
3339 
3340   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3341   ins_cost(150);
3342   ins_encode %{
3343     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3344   %}
3345   ins_pipe(pipe_slow);
3346 %}
3347 
3348 instruct absF_reg(regF dst) %{
3349   predicate((UseSSE>=1) && (UseAVX == 0));
3350   match(Set dst (AbsF dst));
3351   ins_cost(150);
3352   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
3353   ins_encode %{
3354     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
3355   %}
3356   ins_pipe(pipe_slow);
3357 %}
3358 
3359 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
3360   predicate(UseAVX > 0);
3361   match(Set dst (AbsF src));
3362   ins_cost(150);
3363   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
3364   ins_encode %{
3365     int vlen_enc = Assembler::AVX_128bit;
3366     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
3367               ExternalAddress(float_signmask()), vlen_enc);
3368   %}
3369   ins_pipe(pipe_slow);
3370 %}
3371 
3372 instruct absD_reg(regD dst) %{
3373   predicate((UseSSE>=2) && (UseAVX == 0));
3374   match(Set dst (AbsD dst));
3375   ins_cost(150);
3376   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
3377             "# abs double by sign masking" %}
3378   ins_encode %{
3379     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
3380   %}
3381   ins_pipe(pipe_slow);
3382 %}
3383 
3384 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
3385   predicate(UseAVX > 0);
3386   match(Set dst (AbsD src));
3387   ins_cost(150);
3388   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
3389             "# abs double by sign masking" %}
3390   ins_encode %{
3391     int vlen_enc = Assembler::AVX_128bit;
3392     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
3393               ExternalAddress(double_signmask()), vlen_enc);
3394   %}
3395   ins_pipe(pipe_slow);
3396 %}
3397 
3398 instruct negF_reg(regF dst) %{
3399   predicate((UseSSE>=1) && (UseAVX == 0));
3400   match(Set dst (NegF dst));
3401   ins_cost(150);
3402   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
3403   ins_encode %{
3404     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
3405   %}
3406   ins_pipe(pipe_slow);
3407 %}
3408 
3409 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
3410   predicate(UseAVX > 0);
3411   match(Set dst (NegF src));
3412   ins_cost(150);
3413   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
3414   ins_encode %{
3415     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
3416                  ExternalAddress(float_signflip()));
3417   %}
3418   ins_pipe(pipe_slow);
3419 %}
3420 
3421 instruct negD_reg(regD dst) %{
3422   predicate((UseSSE>=2) && (UseAVX == 0));
3423   match(Set dst (NegD dst));
3424   ins_cost(150);
3425   format %{ "xorpd   $dst, [0x8000000000000000]\t"
3426             "# neg double by sign flipping" %}
3427   ins_encode %{
3428     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
3429   %}
3430   ins_pipe(pipe_slow);
3431 %}
3432 
3433 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
3434   predicate(UseAVX > 0);
3435   match(Set dst (NegD src));
3436   ins_cost(150);
3437   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
3438             "# neg double by sign flipping" %}
3439   ins_encode %{
3440     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
3441                  ExternalAddress(double_signflip()));
3442   %}
3443   ins_pipe(pipe_slow);
3444 %}
3445 
3446 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
3447 // Therefore, only the instruct rule where the input is pre-loaded into the dst register is defined below.
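     // (Informal rationale: the scalar sqrt writes only the low element and merges
     // the rest of dst, so dst would otherwise carry a dependency on its previous
     // contents; the same reasoning applies to sqrtsd below.)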
3448 instruct sqrtF_reg(regF dst) %{
3449   predicate(UseSSE>=1);
3450   match(Set dst (SqrtF dst));
3451   format %{ "sqrtss  $dst, $dst" %}
3452   ins_encode %{
3453     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
3454   %}
3455   ins_pipe(pipe_slow);
3456 %}
3457 
3458 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
3459 // Therefore, only the instruct rule where the input is pre-loaded into the dst register is defined below.
3460 instruct sqrtD_reg(regD dst) %{
3461   predicate(UseSSE>=2);
3462   match(Set dst (SqrtD dst));
3463   format %{ "sqrtsd  $dst, $dst" %}
3464   ins_encode %{
3465     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
3466   %}
3467   ins_pipe(pipe_slow);
3468 %}
3469 
3470 
3471 // ---------------------------------------- VectorReinterpret ------------------------------------
3472 instruct reinterpret_mask(kReg dst) %{
3473   predicate(n->bottom_type()->isa_vectmask() &&
3474             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
3475   match(Set dst (VectorReinterpret dst));
3476   ins_cost(125);
3477   format %{ "vector_reinterpret $dst\t!" %}
3478   ins_encode %{
3479     // empty
3480   %}
3481   ins_pipe( pipe_slow );
3482 %}
3483 
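     // Informal note on the mask-width reinterprets below (W2B, D2B, Q2B): an
     // opmask holds one bit per lane, so changing the lane size cannot be a plain
     // k-register copy. The rules instead round-trip through a vector register:
     // evpmovm2w/d/q expands the source mask into all-ones/all-zeros lanes, and
     // evpmovb2m re-derives a per-byte mask of the same overall size in bytes.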
3484 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
3485   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3486             n->bottom_type()->isa_vectmask() &&
3487             n->in(1)->bottom_type()->isa_vectmask() &&
3488             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
3489             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
3490   match(Set dst (VectorReinterpret src));
3491   effect(TEMP xtmp);
3492   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
3493   ins_encode %{
3494      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
3495      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3496      assert(src_sz == dst_sz, "src and dst size mismatch");
3497      int vlen_enc = vector_length_encoding(src_sz);
3498      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3499      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3500   %}
3501   ins_pipe( pipe_slow );
3502 %}
3503 
3504 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
3505   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3506             n->bottom_type()->isa_vectmask() &&
3507             n->in(1)->bottom_type()->isa_vectmask() &&
3508             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
3509              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
3510             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
3511   match(Set dst (VectorReinterpret src));
3512   effect(TEMP xtmp);
3513   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
3514   ins_encode %{
3515      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
3516      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3517      assert(src_sz == dst_sz, "src and dst size mismatch");
3518      int vlen_enc = vector_length_encoding(src_sz);
3519      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3520      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3521   %}
3522   ins_pipe( pipe_slow );
3523 %}
3524 
3525 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
3526   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3527             n->bottom_type()->isa_vectmask() &&
3528             n->in(1)->bottom_type()->isa_vectmask() &&
3529             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
3530              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
3531             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
3532   match(Set dst (VectorReinterpret src));
3533   effect(TEMP xtmp);
3534   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
3535   ins_encode %{
3536      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
3537      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3538      assert(src_sz == dst_sz, "src and dst size mismatch");
3539      int vlen_enc = vector_length_encoding(src_sz);
3540      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3541      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3542   %}
3543   ins_pipe( pipe_slow );
3544 %}
3545 
3546 instruct reinterpret(vec dst) %{
3547   predicate(!n->bottom_type()->isa_vectmask() &&
3548             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
3549   match(Set dst (VectorReinterpret dst));
3550   ins_cost(125);
3551   format %{ "vector_reinterpret $dst\t!" %}
3552   ins_encode %{
3553     // empty
3554   %}
3555   ins_pipe( pipe_slow );
3556 %}
3557 
3558 instruct reinterpret_expand(vec dst, vec src, rRegP scratch) %{
3559   predicate(UseAVX == 0 &&
3560             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3561   match(Set dst (VectorReinterpret src));
3562   ins_cost(125);
3563   effect(TEMP dst, TEMP scratch);
3564   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3565   ins_encode %{
3566     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
3567     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
3568 
3569     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
3570     if (src_vlen_in_bytes == 4) {
3571       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3572     } else {
3573       assert(src_vlen_in_bytes == 8, "");
3574       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3575     }
3576     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3577   %}
3578   ins_pipe( pipe_slow );
3579 %}
3580 
3581 instruct vreinterpret_expand4(legVec dst, vec src, rRegP scratch) %{
3582   predicate(UseAVX > 0 &&
3583             !n->bottom_type()->isa_vectmask() &&
3584             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
3585             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3586   match(Set dst (VectorReinterpret src));
3587   ins_cost(125);
3588   effect(TEMP scratch);
3589   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3590   ins_encode %{
3591     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, $scratch$$Register);
3592   %}
3593   ins_pipe( pipe_slow );
3594 %}
3595 
3596 
3597 instruct vreinterpret_expand(legVec dst, vec src) %{
3598   predicate(UseAVX > 0 &&
3599             !n->bottom_type()->isa_vectmask() &&
3600             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
3601             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3602   match(Set dst (VectorReinterpret src));
3603   ins_cost(125);
3604   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
3605   ins_encode %{
3606     switch (Matcher::vector_length_in_bytes(this, $src)) {
3607       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3608       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3609       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3610       default: ShouldNotReachHere();
3611     }
3612   %}
3613   ins_pipe( pipe_slow );
3614 %}
3615 
3616 instruct reinterpret_shrink(vec dst, legVec src) %{
3617   predicate(!n->bottom_type()->isa_vectmask() &&
3618             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
3619   match(Set dst (VectorReinterpret src));
3620   ins_cost(125);
3621   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
3622   ins_encode %{
3623     switch (Matcher::vector_length_in_bytes(this)) {
3624       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
3625       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3626       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3627       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3628       default: ShouldNotReachHere();
3629     }
3630   %}
3631   ins_pipe( pipe_slow );
3632 %}
3633 
3634 // ----------------------------------------------------------------------------------------------------
3635 
3636 #ifdef _LP64
3637 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
3638   match(Set dst (RoundDoubleMode src rmode));
3639   format %{ "roundsd $dst,$src" %}
3640   ins_cost(150);
3641   ins_encode %{
3642     assert(UseSSE >= 4, "required");
3643     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
3644   %}
3645   ins_pipe(pipe_slow);
3646 %}
3647 
3648 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
3649   match(Set dst (RoundDoubleMode (LoadD src) rmode));
3650   format %{ "roundsd $dst,$src" %}
3651   ins_cost(150);
3652   ins_encode %{
3653     assert(UseSSE >= 4, "required");
3654     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
3655   %}
3656   ins_pipe(pipe_slow);
3657 %}
3658 
3659 instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
3660   match(Set dst (RoundDoubleMode con rmode));
3661   effect(TEMP scratch_reg);
3662   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
3663   ins_cost(150);
3664   ins_encode %{
3665     assert(UseSSE >= 4, "required");
3666     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
3667   %}
3668   ins_pipe(pipe_slow);
3669 %}
3670 
3671 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
3672   predicate(Matcher::vector_length(n) < 8);
3673   match(Set dst (RoundDoubleModeV src rmode));
3674   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
3675   ins_encode %{
3676     assert(UseAVX > 0, "required");
3677     int vlen_enc = vector_length_encoding(this);
3678     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
3679   %}
3680   ins_pipe( pipe_slow );
3681 %}
3682 
3683 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
3684   predicate(Matcher::vector_length(n) == 8);
3685   match(Set dst (RoundDoubleModeV src rmode));
3686   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
3687   ins_encode %{
3688     assert(UseAVX > 2, "required");
3689     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
3690   %}
3691   ins_pipe( pipe_slow );
3692 %}
3693 
3694 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3695   predicate(Matcher::vector_length(n) < 8);
3696   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3697   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3698   ins_encode %{
3699     assert(UseAVX > 0, "required");
3700     int vlen_enc = vector_length_encoding(this);
3701     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
3702   %}
3703   ins_pipe( pipe_slow );
3704 %}
3705 
3706 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3707   predicate(Matcher::vector_length(n) == 8);
3708   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3709   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3710   ins_encode %{
3711     assert(UseAVX > 2, "required");
3712     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3713   %}
3714   ins_pipe( pipe_slow );
3715 %}
3716 #endif // _LP64
3717 
3718 instruct onspinwait() %{
3719   match(OnSpinWait);
3720   ins_cost(200);
3721 
3722   format %{
3723     $$template
3724     $$emit$$"pause\t! membar_onspinwait"
3725   %}
3726   ins_encode %{
3727     __ pause();
3728   %}
3729   ins_pipe(pipe_slow);
3730 %}
3731 
3732 // a * b + c
3733 instruct fmaD_reg(regD a, regD b, regD c) %{
3734   predicate(UseFMA);
3735   match(Set c (FmaD  c (Binary a b)));
3736   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3737   ins_cost(150);
3738   ins_encode %{
3739     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3740   %}
3741   ins_pipe( pipe_slow );
3742 %}
3743 
3744 // a * b + c
3745 instruct fmaF_reg(regF a, regF b, regF c) %{
3746   predicate(UseFMA);
3747   match(Set c (FmaF  c (Binary a b)));
3748   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3749   ins_cost(150);
3750   ins_encode %{
3751     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3752   %}
3753   ins_pipe( pipe_slow );
3754 %}
3755 
3756 // ====================VECTOR INSTRUCTIONS=====================================
3757 
3758 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
3759 instruct MoveVec2Leg(legVec dst, vec src) %{
3760   match(Set dst src);
3761   format %{ "" %}
3762   ins_encode %{
3763     ShouldNotReachHere();
3764   %}
3765   ins_pipe( fpu_reg_reg );
3766 %}
3767 
3768 instruct MoveLeg2Vec(vec dst, legVec src) %{
3769   match(Set dst src);
3770   format %{ "" %}
3771   ins_encode %{
3772     ShouldNotReachHere();
3773   %}
3774   ins_pipe( fpu_reg_reg );
3775 %}
3776 
3777 // ============================================================================
3778 
3779 // Load vectors generic operand pattern
3780 instruct loadV(vec dst, memory mem) %{
3781   match(Set dst (LoadVector mem));
3782   ins_cost(125);
3783   format %{ "load_vector $dst,$mem" %}
3784   ins_encode %{
3785     switch (Matcher::vector_length_in_bytes(this)) {
3786       case  4: __ movdl    ($dst$$XMMRegister, $mem$$Address); break;
3787       case  8: __ movq     ($dst$$XMMRegister, $mem$$Address); break;
3788       case 16: __ movdqu   ($dst$$XMMRegister, $mem$$Address); break;
3789       case 32: __ vmovdqu  ($dst$$XMMRegister, $mem$$Address); break;
3790       case 64: __ evmovdqul($dst$$XMMRegister, $mem$$Address, Assembler::AVX_512bit); break;
3791       default: ShouldNotReachHere();
3792     }
3793   %}
3794   ins_pipe( pipe_slow );
3795 %}
3796 
3797 // Store vectors generic operand pattern.
3798 instruct storeV(memory mem, vec src) %{
3799   match(Set mem (StoreVector mem src));
3800   ins_cost(145);
3801   format %{ "store_vector $mem,$src\n\t" %}
3802   ins_encode %{
3803     switch (Matcher::vector_length_in_bytes(this, $src)) {
3804       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
3805       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
3806       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
3807       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
3808       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
3809       default: ShouldNotReachHere();
3810     }
3811   %}
3812   ins_pipe( pipe_slow );
3813 %}
3814 
3815 // ---------------------------------------- Gather ------------------------------------
3816 
3817 // Gather INT, LONG, FLOAT, DOUBLE
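     // Informal background on the TEMP mask/opmask below: the AVX2 vgather* forms
     // take a vector mask that selects which lanes to load and is cleared as the
     // gather completes, so an all-ones mask is re-materialized for each gather;
     // the AVX-512 forms consume an opmask (k) register in the same way, which is
     // why a fresh ktmp is loaded or copied before every gather/scatter.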
3818 
3819 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
3820   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
3821   match(Set dst (LoadVectorGather mem idx));
3822   effect(TEMP dst, TEMP tmp, TEMP mask);
3823   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
3824   ins_encode %{
3825     assert(UseAVX >= 2, "sanity");
3826 
3827     int vlen_enc = vector_length_encoding(this);
3828     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3829 
3830     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
3831     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3832 
3833     if (vlen_enc == Assembler::AVX_128bit) {
3834       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3835     } else {
3836       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3837     }
3838     __ lea($tmp$$Register, $mem$$Address);
3839     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
3840   %}
3841   ins_pipe( pipe_slow );
3842 %}
3843 
3844 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
3845   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
3846   match(Set dst (LoadVectorGather mem idx));
3847   effect(TEMP dst, TEMP tmp, TEMP ktmp);
3848   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
3849   ins_encode %{
3850     assert(UseAVX > 2, "sanity");
3851 
3852     int vlen_enc = vector_length_encoding(this);
3853     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3854 
3855     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3856 
3857     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3858     __ lea($tmp$$Register, $mem$$Address);
3859     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3860   %}
3861   ins_pipe( pipe_slow );
3862 %}
3863 
3864 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
3865   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
3866   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
3867   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
3868   ins_encode %{
3869     assert(UseAVX > 2, "sanity");
3870     int vlen_enc = vector_length_encoding(this);
3871     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3872     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3873     // Note: Since the gather instruction partially updates the opmask register used
3874     // for predication, the mask operand is first copied to a temporary.
3875     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
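    // Clear dst up front: the hardware gather only writes the lanes selected
    // by the opmask and leaves the other lanes of dst unchanged, so pre-zeroing
    // makes unselected lanes come back as zero.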
3876     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3877     __ lea($tmp$$Register, $mem$$Address);
3878     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3879   %}
3880   ins_pipe( pipe_slow );
3881 %}
3882 // ====================Scatter=======================================
3883 
3884 // Scatter INT, LONG, FLOAT, DOUBLE
3885 
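// StoreVectorScatter is the store-side mirror of gather: lane i of src is
// written at a scaled offset idx[i] from the base address. A scalar sketch,
// assuming int elements and element indices in idx:
//
//   for (int i = 0; i < vlen; i++) {
//     base[idx[i]] = src[i];
//   }
//
// Scatter only exists as an EVEX, opmask-predicated instruction, so both rules
// below go through evscatter with a kReg temporary and require AVX-512.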
3886 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
3887   predicate(UseAVX > 2);
3888   match(Set mem (StoreVectorScatter mem (Binary src idx)));
3889   effect(TEMP tmp, TEMP ktmp);
3890   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
3891   ins_encode %{
3892     int vlen_enc = vector_length_encoding(this, $src);
3893     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
3894 
3895     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
3896     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3897 
3898     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3899     __ lea($tmp$$Register, $mem$$Address);
3900     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
3901   %}
3902   ins_pipe( pipe_slow );
3903 %}
3904 
3905 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
3906   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
3907   effect(TEMP tmp, TEMP ktmp);
3908   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t! using $tmp and $ktmp as TEMP" %}
3909   ins_encode %{
3910     int vlen_enc = vector_length_encoding(this, $src);
3911     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
3912     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
3913     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3914     // Note: Since the scatter instruction partially updates the opmask register used
3915     // for predication, the mask operand is first copied to a temporary.
3916     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
3917     __ lea($tmp$$Register, $mem$$Address);
3918     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
3919   %}
3920   ins_pipe( pipe_slow );
3921 %}
3922 
3923 // ====================REPLICATE=======================================
3924 
3925 // Replicate byte scalar to be vector
3926 instruct ReplB_reg(vec dst, rRegI src) %{
3927   match(Set dst (ReplicateB src));
3928   format %{ "replicateB $dst,$src" %}
3929   ins_encode %{
3930     uint vlen = Matcher::vector_length(this);
3931     if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
3932       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
3933       int vlen_enc = vector_length_encoding(this);
3934       __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
3935     } else if (VM_Version::supports_avx2()) {
3936       int vlen_enc = vector_length_encoding(this);
3937       __ movdl($dst$$XMMRegister, $src$$Register);
3938       __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3939     } else {
3940       __ movdl($dst$$XMMRegister, $src$$Register);
3941       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3942       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3943       if (vlen >= 16) {
3944         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3945         if (vlen >= 32) {
3946           assert(vlen == 32, "sanity");
3947           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3948         }
3949       }
3950     }
3951   %}
3952   ins_pipe( pipe_slow );
3953 %}
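// The pre-AVX2 path above widens the broadcast in stages: movdl puts the byte
// into lane 0, punpcklbw doubles it to two bytes, pshuflw(0x00) spreads that
// word across the low 8 bytes, punpcklqdq fills all 16 bytes, and for 32-byte
// vectors vinserti128_high copies the low 128 bits into the upper half.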
3954 
3955 instruct ReplB_mem(vec dst, memory mem) %{
3956   predicate(VM_Version::supports_avx2());
3957   match(Set dst (ReplicateB (LoadB mem)));
3958   format %{ "replicateB $dst,$mem" %}
3959   ins_encode %{
3960     int vlen_enc = vector_length_encoding(this);
3961     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
3962   %}
3963   ins_pipe( pipe_slow );
3964 %}
3965 
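// ReplB_imm/ReplS_imm/ReplI_imm materialize a constant broadcast from the
// constant table: the replicate8_imm(con, width) helper repeats an
// element-sized slice of the immediate across a 64-bit constant (e.g.
// con = 0x41 at byte width should give 0x4141414141414141), which is then
// loaded with movdl/movq and widened with punpcklqdq or vpbroadcast* as needed.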
3966 instruct ReplB_imm(vec dst, immI con) %{
3967   match(Set dst (ReplicateB con));
3968   format %{ "replicateB $dst,$con" %}
3969   ins_encode %{
3970     uint vlen = Matcher::vector_length(this);
3971     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 1));
3972     if (vlen == 4) {
3973       __ movdl($dst$$XMMRegister, const_addr);
3974     } else {
3975       __ movq($dst$$XMMRegister, const_addr);
3976       if (vlen >= 16) {
3977         if (VM_Version::supports_avx2()) {
3978           int vlen_enc = vector_length_encoding(this);
3979           __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3980         } else {
3981           assert(vlen == 16, "sanity");
3982           __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3983         }
3984       }
3985     }
3986   %}
3987   ins_pipe( pipe_slow );
3988 %}
3989 
3990 // Replicate byte scalar zero to be vector
3991 instruct ReplB_zero(vec dst, immI_0 zero) %{
3992   match(Set dst (ReplicateB zero));
3993   format %{ "replicateB $dst,$zero" %}
3994   ins_encode %{
3995     uint vlen = Matcher::vector_length(this);
3996     if (vlen <= 16) {
3997       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3998     } else {
3999       // Use vpxor since AVX512F does not have 512bit vxorpd (requires AVX512DQ).
4000       int vlen_enc = vector_length_encoding(this);
4001       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4002     }
4003   %}
4004   ins_pipe( fpu_reg_reg );
4005 %}
4006 
4007 // ====================ReplicateS=======================================
4008 
4009 instruct ReplS_reg(vec dst, rRegI src) %{
4010   match(Set dst (ReplicateS src));
4011   format %{ "replicateS $dst,$src" %}
4012   ins_encode %{
4013     uint vlen = Matcher::vector_length(this);
4014     if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
4015       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
4016       int vlen_enc = vector_length_encoding(this);
4017       __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
4018     } else if (VM_Version::supports_avx2()) {
4019       int vlen_enc = vector_length_encoding(this);
4020       __ movdl($dst$$XMMRegister, $src$$Register);
4021       __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4022     } else {
4023       __ movdl($dst$$XMMRegister, $src$$Register);
4024       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4025       if (vlen >= 8) {
4026         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4027         if (vlen >= 16) {
4028           assert(vlen == 16, "sanity");
4029           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4030         }
4031       }
4032     }
4033   %}
4034   ins_pipe( pipe_slow );
4035 %}
4036 
4037 instruct ReplS_mem(vec dst, memory mem) %{
4038   predicate(VM_Version::supports_avx2());
4039   match(Set dst (ReplicateS (LoadS mem)));
4040   format %{ "replicateS $dst,$mem" %}
4041   ins_encode %{
4042     int vlen_enc = vector_length_encoding(this);
4043     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
4044   %}
4045   ins_pipe( pipe_slow );
4046 %}
4047 
4048 instruct ReplS_imm(vec dst, immI con) %{
4049   match(Set dst (ReplicateS con));
4050   format %{ "replicateS $dst,$con" %}
4051   ins_encode %{
4052     uint vlen = Matcher::vector_length(this);
4053     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 2));
4054     if (vlen == 2) {
4055       __ movdl($dst$$XMMRegister, const_addr);
4056     } else {
4057       __ movq($dst$$XMMRegister, const_addr);
4058       if (vlen >= 8) {
4059         if (VM_Version::supports_avx2()) {
4060           int vlen_enc = vector_length_encoding(this);
4061           __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4062         } else {
4063           assert(vlen == 8, "sanity");
4064           __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4065         }
4066       }
4067     }
4068   %}
4069   ins_pipe( fpu_reg_reg );
4070 %}
4071 
4072 instruct ReplS_zero(vec dst, immI_0 zero) %{
4073   match(Set dst (ReplicateS zero));
4074   format %{ "replicateS $dst,$zero" %}
4075   ins_encode %{
4076     uint vlen = Matcher::vector_length(this);
4077     if (vlen <= 8) {
4078       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4079     } else {
4080       int vlen_enc = vector_length_encoding(this);
4081       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4082     }
4083   %}
4084   ins_pipe( fpu_reg_reg );
4085 %}
4086 
4087 // ====================ReplicateI=======================================
4088 
4089 instruct ReplI_reg(vec dst, rRegI src) %{
4090   match(Set dst (ReplicateI src));
4091   format %{ "replicateI $dst,$src" %}
4092   ins_encode %{
4093     uint vlen = Matcher::vector_length(this);
4094     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4095       int vlen_enc = vector_length_encoding(this);
4096       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
4097     } else if (VM_Version::supports_avx2()) {
4098       int vlen_enc = vector_length_encoding(this);
4099       __ movdl($dst$$XMMRegister, $src$$Register);
4100       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4101     } else {
4102       __ movdl($dst$$XMMRegister, $src$$Register);
4103       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4104       if (vlen >= 8) {
4105         assert(vlen == 8, "sanity");
4106         __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4107       }
4108     }
4109   %}
4110   ins_pipe( pipe_slow );
4111 %}
4112 
4113 instruct ReplI_mem(vec dst, memory mem) %{
4114   match(Set dst (ReplicateI (LoadI mem)));
4115   format %{ "replicateI $dst,$mem" %}
4116   ins_encode %{
4117     uint vlen = Matcher::vector_length(this);
4118     if (vlen <= 4) {
4119       __ movdl($dst$$XMMRegister, $mem$$Address);
4120       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4121     } else {
4122       assert(VM_Version::supports_avx2(), "sanity");
4123       int vlen_enc = vector_length_encoding(this);
4124       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4125     }
4126   %}
4127   ins_pipe( pipe_slow );
4128 %}
4129 
4130 instruct ReplI_imm(vec dst, immI con) %{
4131   match(Set dst (ReplicateI con));
4132   format %{ "replicateI $dst,$con" %}
4133   ins_encode %{
4134     uint vlen = Matcher::vector_length(this);
4135     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 4));
4136     if (vlen <= 4) {
4137       __ movq($dst$$XMMRegister, const_addr);
4138       if (vlen == 4) {
4139         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4140       }
4141     } else {
4142       assert(VM_Version::supports_avx2(), "sanity");
4143       int vlen_enc = vector_length_encoding(this);
4144       __ movq($dst$$XMMRegister, const_addr);
4145       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4146     }
4147   %}
4148   ins_pipe( pipe_slow );
4149 %}
4150 
4151 // Replicate integer (4 byte) scalar zero to be vector
4152 instruct ReplI_zero(vec dst, immI_0 zero) %{
4153   match(Set dst (ReplicateI zero));
4154   format %{ "replicateI $dst,$zero" %}
4155   ins_encode %{
4156     uint vlen = Matcher::vector_length(this);
4157     if (vlen <= 4) {
4158       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4159     } else {
4160       int vlen_enc = vector_length_encoding(this);
4161       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4162     }
4163   %}
4164   ins_pipe( fpu_reg_reg );
4165 %}
4166 
4167 instruct ReplI_M1(vec dst, immI_M1 con) %{
4168   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) >= 16);
4169   match(Set dst (ReplicateB con));
4170   match(Set dst (ReplicateS con));
4171   match(Set dst (ReplicateI con));
4172   effect(TEMP dst);
4173   format %{ "vallones $dst" %}
4174   ins_encode %{
4175     int vector_len = vector_length_encoding(this);
4176     __ vallones($dst$$XMMRegister, vector_len);
4177   %}
4178   ins_pipe( pipe_slow );
4179 %}
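// ReplI_M1 (and ReplL_M1 below) fold a broadcast of -1 into a single vallones
// call, which sets every bit of dst; since an all-ones pattern looks the same
// for byte, short, int and long lanes, one rule serves ReplicateB, ReplicateS
// and ReplicateI alike.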
4180 
4181 // ====================ReplicateL=======================================
4182 
4183 #ifdef _LP64
4184 // Replicate long (8 byte) scalar to be vector
4185 instruct ReplL_reg(vec dst, rRegL src) %{
4186   match(Set dst (ReplicateL src));
4187   format %{ "replicateL $dst,$src" %}
4188   ins_encode %{
4189     uint vlen = Matcher::vector_length(this);
4190     if (vlen == 2) {
4191       __ movdq($dst$$XMMRegister, $src$$Register);
4192       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4193     } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4194       int vlen_enc = vector_length_encoding(this);
4195       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
4196     } else if (VM_Version::supports_avx2()) {
4197       assert(vlen == 4, "sanity");
4198       int vlen_enc = vector_length_encoding(this);
4199       __ movdq($dst$$XMMRegister, $src$$Register);
4200       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4201     } else {
4202       assert(vlen == 4, "sanity");
4203       __ movdq($dst$$XMMRegister, $src$$Register);
4204       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4205       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4206     }
4207   %}
4208   ins_pipe( pipe_slow );
4209 %}
4210 #else // _LP64
4211 // Replicate long (8 byte) scalar to be vector
4212 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
4213   predicate(Matcher::vector_length(n) <= 4);
4214   match(Set dst (ReplicateL src));
4215   effect(TEMP dst, USE src, TEMP tmp);
4216   format %{ "replicateL $dst,$src" %}
4217   ins_encode %{
4218     uint vlen = Matcher::vector_length(this);
4219     if (vlen == 2) {
4220       __ movdl($dst$$XMMRegister, $src$$Register);
4221       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4222       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4223       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4224     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4225       int vlen_enc = Assembler::AVX_256bit;
4226       __ movdl($dst$$XMMRegister, $src$$Register);
4227       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4228       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4229       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4230     } else {
4231       __ movdl($dst$$XMMRegister, $src$$Register);
4232       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4233       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4234       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4235       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4236     }
4237   %}
4238   ins_pipe( pipe_slow );
4239 %}
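// On 32-bit, a long lives in a register pair, so the rule above first
// assembles the 64-bit value in the XMM register: movdl moves the low and high
// halves into two XMM registers, punpckldq interleaves them into one 64-bit
// lane, and the result is then broadcast with punpcklqdq, vpbroadcastq or
// vinserti128_high depending on the vector length.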
4240 
4241 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
4242   predicate(Matcher::vector_length(n) == 8);
4243   match(Set dst (ReplicateL src));
4244   effect(TEMP dst, USE src, TEMP tmp);
4245   format %{ "replicateL $dst,$src" %}
4246   ins_encode %{
4247     if (VM_Version::supports_avx512vl()) {
4248       __ movdl($dst$$XMMRegister, $src$$Register);
4249       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4250       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4251       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4252       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4253       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4254     } else {
4255       int vlen_enc = Assembler::AVX_512bit;
4256       __ movdl($dst$$XMMRegister, $src$$Register);
4257       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4258       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4259       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4260     }
4261   %}
4262   ins_pipe( pipe_slow );
4263 %}
4264 #endif // _LP64
4265 
4266 instruct ReplL_mem(vec dst, memory mem) %{
4267   match(Set dst (ReplicateL (LoadL mem)));
4268   format %{ "replicateL $dst,$mem" %}
4269   ins_encode %{
4270     uint vlen = Matcher::vector_length(this);
4271     if (vlen == 2) {
4272       __ movq($dst$$XMMRegister, $mem$$Address);
4273       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4274     } else {
4275       assert(VM_Version::supports_avx2(), "sanity");
4276       int vlen_enc = vector_length_encoding(this);
4277       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
4278     }
4279   %}
4280   ins_pipe( pipe_slow );
4281 %}
4282 
4283 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4284 instruct ReplL_imm(vec dst, immL con) %{
4285   match(Set dst (ReplicateL con));
4286   format %{ "replicateL $dst,$con" %}
4287   ins_encode %{
4288     uint vlen = Matcher::vector_length(this);
4289     InternalAddress const_addr = $constantaddress($con);
4290     if (vlen == 2) {
4291       __ movq($dst$$XMMRegister, const_addr);
4292       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4293     } else {
4294       assert(VM_Version::supports_avx2(), "sanity");
4295       int vlen_enc = vector_length_encoding(this);
4296       __ movq($dst$$XMMRegister, const_addr);
4297       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4298     }
4299   %}
4300   ins_pipe( pipe_slow );
4301 %}
4302 
4303 instruct ReplL_zero(vec dst, immL0 zero) %{
4304   match(Set dst (ReplicateL zero));
4305   format %{ "replicateL $dst,$zero" %}
4306   ins_encode %{
4307     int vlen = Matcher::vector_length(this);
4308     if (vlen == 2) {
4309       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4310     } else {
4311       int vlen_enc = vector_length_encoding(this);
4312       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4313     }
4314   %}
4315   ins_pipe( fpu_reg_reg );
4316 %}
4317 
4318 instruct ReplL_M1(vec dst, immL_M1 con) %{
4319   predicate(UseAVX > 0);
4320   match(Set dst (ReplicateL con));
4321   effect(TEMP dst);
4322   format %{ "vallones $dst" %}
4323   ins_encode %{
4324     int vector_len = vector_length_encoding(this);
4325     __ vallones($dst$$XMMRegister, vector_len);
4326   %}
4327   ins_pipe( pipe_slow );
4328 %}
4329 
4330 // ====================ReplicateF=======================================
4331 
4332 instruct ReplF_reg(vec dst, vlRegF src) %{
4333   match(Set dst (ReplicateF src));
4334   format %{ "replicateF $dst,$src" %}
4335   ins_encode %{
4336     uint vlen = Matcher::vector_length(this);
4337     if (vlen <= 4) {
4338       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4339     } else if (VM_Version::supports_avx2()) {
4340       int vlen_enc = vector_length_encoding(this);
4341       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4342     } else {
4343       assert(vlen == 8, "sanity");
4344       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4345       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4346     }
4347   %}
4348   ins_pipe( pipe_slow );
4349 %}
4350 
4351 instruct ReplF_mem(vec dst, memory mem) %{
4352   match(Set dst (ReplicateF (LoadF mem)));
4353   format %{ "replicateF $dst,$mem" %}
4354   ins_encode %{
4355     uint vlen = Matcher::vector_length(this);
4356     if (vlen <= 4) {
4357       __ movdl($dst$$XMMRegister, $mem$$Address);
4358       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4359     } else {
4360       assert(VM_Version::supports_avx(), "sanity");
4361       int vlen_enc = vector_length_encoding(this);
4362       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
4363     }
4364   %}
4365   ins_pipe( pipe_slow );
4366 %}
4367 
4368 instruct ReplF_zero(vec dst, immF0 zero) %{
4369   match(Set dst (ReplicateF zero));
4370   format %{ "replicateF $dst,$zero" %}
4371   ins_encode %{
4372     uint vlen = Matcher::vector_length(this);
4373     if (vlen <= 4) {
4374       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4375     } else {
4376       int vlen_enc = vector_length_encoding(this);
4377       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4378     }
4379   %}
4380   ins_pipe( fpu_reg_reg );
4381 %}
4382 
4383 // ====================ReplicateD=======================================
4384 
4385 // Replicate double (8 bytes) scalar to be vector
4386 instruct ReplD_reg(vec dst, vlRegD src) %{
4387   match(Set dst (ReplicateD src));
4388   format %{ "replicateD $dst,$src" %}
4389   ins_encode %{
4390     uint vlen = Matcher::vector_length(this);
4391     if (vlen == 2) {
4392       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4393     } else if (VM_Version::supports_avx2()) {
4394       int vlen_enc = vector_length_encoding(this);
4395       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4396     } else {
4397       assert(vlen == 4, "sanity");
4398       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4399       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4400     }
4401   %}
4402   ins_pipe( pipe_slow );
4403 %}
4404 
4405 instruct ReplD_mem(vec dst, memory mem) %{
4406   match(Set dst (ReplicateD (LoadD mem)));
4407   format %{ "replicateD $dst,$mem" %}
4408   ins_encode %{
4409     uint vlen = Matcher::vector_length(this);
4410     if (vlen == 2) {
4411       __ movq($dst$$XMMRegister, $mem$$Address);
4412       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
4413     } else {
4414       assert(VM_Version::supports_avx(), "sanity");
4415       int vlen_enc = vector_length_encoding(this);
4416       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4417     }
4418   %}
4419   ins_pipe( pipe_slow );
4420 %}
4421 
4422 instruct ReplD_zero(vec dst, immD0 zero) %{
4423   match(Set dst (ReplicateD zero));
4424   format %{ "replicateD $dst,$zero" %}
4425   ins_encode %{
4426     uint vlen = Matcher::vector_length(this);
4427     if (vlen == 2) {
4428       __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4429     } else {
4430       int vlen_enc = vector_length_encoding(this);
4431       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorpd requires AVX512DQ
4432     }
4433   %}
4434   ins_pipe( fpu_reg_reg );
4435 %}
4436 
4437 // ====================VECTOR INSERT=======================================
4438 
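// VectorInsert replaces a single lane of a vector with a scalar and leaves the
// other lanes unchanged. For vectors wider than 128 bits the rules below work
// lane-wise: y_idx selects the 128-bit lane holding the element, x_idx is the
// element's position inside that lane, and the sequence is extract-lane,
// insert-scalar, re-insert-lane. Sketch for a 256-bit int vector and idx = 5
// (8 ints, 4 per 128-bit lane):
//
//   x_idx = 5 & 3 = 1, y_idx = 5 >> 2 = 1
//   vextracti128  vtmp, src, 1
//   vpinsrd       vtmp, vtmp, val, 1
//   vinserti128   dst, src, vtmp, 1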
4439 instruct insert(vec dst, rRegI val, immU8 idx) %{
4440   predicate(Matcher::vector_length_in_bytes(n) < 32);
4441   match(Set dst (VectorInsert (Binary dst val) idx));
4442   format %{ "vector_insert $dst,$val,$idx" %}
4443   ins_encode %{
4444     assert(UseSSE >= 4, "required");
4445     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
4446 
4447     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4448 
4449     assert(is_integral_type(elem_bt), "");
4450     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4451 
4452     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4453   %}
4454   ins_pipe( pipe_slow );
4455 %}
4456 
4457 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4458   predicate(Matcher::vector_length_in_bytes(n) == 32);
4459   match(Set dst (VectorInsert (Binary src val) idx));
4460   effect(TEMP vtmp);
4461   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4462   ins_encode %{
4463     int vlen_enc = Assembler::AVX_256bit;
4464     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4465     int elem_per_lane = 16/type2aelembytes(elem_bt);
4466     int log2epr = log2(elem_per_lane);
4467 
4468     assert(is_integral_type(elem_bt), "sanity");
4469     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4470 
4471     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4472     uint y_idx = ($idx$$constant >> log2epr) & 1;
4473     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4474     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4475     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4476   %}
4477   ins_pipe( pipe_slow );
4478 %}
4479 
4480 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4481   predicate(Matcher::vector_length_in_bytes(n) == 64);
4482   match(Set dst (VectorInsert (Binary src val) idx));
4483   effect(TEMP vtmp);
4484   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4485   ins_encode %{
4486     assert(UseAVX > 2, "sanity");
4487 
4488     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4489     int elem_per_lane = 16/type2aelembytes(elem_bt);
4490     int log2epr = log2(elem_per_lane);
4491 
4492     assert(is_integral_type(elem_bt), "");
4493     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4494 
4495     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4496     uint y_idx = ($idx$$constant >> log2epr) & 3;
4497     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4498     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4499     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4500   %}
4501   ins_pipe( pipe_slow );
4502 %}
4503 
4504 #ifdef _LP64
4505 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4506   predicate(Matcher::vector_length(n) == 2);
4507   match(Set dst (VectorInsert (Binary dst val) idx));
4508   format %{ "vector_insert $dst,$val,$idx" %}
4509   ins_encode %{
4510     assert(UseSSE >= 4, "required");
4511     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4512     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4513 
4514     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4515   %}
4516   ins_pipe( pipe_slow );
4517 %}
4518 
4519 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4520   predicate(Matcher::vector_length(n) == 4);
4521   match(Set dst (VectorInsert (Binary src val) idx));
4522   effect(TEMP vtmp);
4523   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4524   ins_encode %{
4525     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4526     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4527 
4528     uint x_idx = $idx$$constant & right_n_bits(1);
4529     uint y_idx = ($idx$$constant >> 1) & 1;
4530     int vlen_enc = Assembler::AVX_256bit;
4531     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4532     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4533     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4534   %}
4535   ins_pipe( pipe_slow );
4536 %}
4537 
4538 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4539   predicate(Matcher::vector_length(n) == 8);
4540   match(Set dst (VectorInsert (Binary src val) idx));
4541   effect(TEMP vtmp);
4542   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4543   ins_encode %{
4544     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
4545     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4546 
4547     uint x_idx = $idx$$constant & right_n_bits(1);
4548     uint y_idx = ($idx$$constant >> 1) & 3;
4549     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4550     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4551     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4552   %}
4553   ins_pipe( pipe_slow );
4554 %}
4555 #endif
4556 
4557 instruct insertF(vec dst, regF val, immU8 idx) %{
4558   predicate(Matcher::vector_length(n) < 8);
4559   match(Set dst (VectorInsert (Binary dst val) idx));
4560   format %{ "vector_insert $dst,$val,$idx" %}
4561   ins_encode %{
4562     assert(UseSSE >= 4, "sanity");
4563 
4564     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4565     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4566 
4567     __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
4568   %}
4569   ins_pipe( pipe_slow );
4570 %}
4571 
4572 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4573   predicate(Matcher::vector_length(n) >= 8);
4574   match(Set dst (VectorInsert (Binary src val) idx));
4575   effect(TEMP vtmp);
4576   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4577   ins_encode %{
4578     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4579     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4580 
4581     int vlen = Matcher::vector_length(this);
4582     uint x_idx = $idx$$constant & right_n_bits(2);
4583     if (vlen == 8) {
4584       uint y_idx = ($idx$$constant >> 2) & 1;
4585       int vlen_enc = Assembler::AVX_256bit;
4586       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4587       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4588       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4589     } else {
4590       assert(vlen == 16, "sanity");
4591       uint y_idx = ($idx$$constant >> 2) & 3;
4592       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4593       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4594       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4595     }
4596   %}
4597   ins_pipe( pipe_slow );
4598 %}
4599 
4600 #ifdef _LP64
4601 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4602   predicate(Matcher::vector_length(n) == 2);
4603   match(Set dst (VectorInsert (Binary dst val) idx));
4604   effect(TEMP tmp);
4605   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4606   ins_encode %{
4607     assert(UseSSE >= 4, "sanity");
4608     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4609     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4610 
4611     __ movq($tmp$$Register, $val$$XMMRegister);
4612     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
4613   %}
4614   ins_pipe( pipe_slow );
4615 %}
4616 
4617 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
4618   predicate(Matcher::vector_length(n) == 4);
4619   match(Set dst (VectorInsert (Binary src val) idx));
4620   effect(TEMP vtmp, TEMP tmp);
4621   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4622   ins_encode %{
4623     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4624     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4625 
4626     uint x_idx = $idx$$constant & right_n_bits(1);
4627     uint y_idx = ($idx$$constant >> 1) & 1;
4628     int vlen_enc = Assembler::AVX_256bit;
4629     __ movq($tmp$$Register, $val$$XMMRegister);
4630     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4631     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4632     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4633   %}
4634   ins_pipe( pipe_slow );
4635 %}
4636 
4637 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
4638   predicate(Matcher::vector_length(n) == 8);
4639   match(Set dst (VectorInsert (Binary src val) idx));
4640   effect(TEMP tmp, TEMP vtmp);
4641   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4642   ins_encode %{
4643     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4644     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4645 
4646     uint x_idx = $idx$$constant & right_n_bits(1);
4647     uint y_idx = ($idx$$constant >> 1) & 3;
4648     __ movq($tmp$$Register, $val$$XMMRegister);
4649     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4650     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4651     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4652   %}
4653   ins_pipe( pipe_slow );
4654 %}
4655 #endif
4656 
4657 // ====================REDUCTION ARITHMETIC=======================================
4658 
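// A reduction collapses all lanes of the vector input into one scalar and
// combines it with the scalar input using the same operator. Sketch for
// AddReductionVI with src1 = 10 and src2 = <1, 2, 3, 4>:
//
//   dst = 10 + (1 + 2 + 3 + 4) = 20
//
// The integral rules take the scalar in src1; the FP add/mul reductions below
// accumulate into dst instead, and the reduce* macro-assembler helpers do the
// lane shuffling.
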
4659 // =======================Int Reduction==========================================
4660 
4661 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4662   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
4663   match(Set dst (AddReductionVI src1 src2));
4664   match(Set dst (MulReductionVI src1 src2));
4665   match(Set dst (AndReductionV  src1 src2));
4666   match(Set dst ( OrReductionV  src1 src2));
4667   match(Set dst (XorReductionV  src1 src2));
4668   match(Set dst (MinReductionV  src1 src2));
4669   match(Set dst (MaxReductionV  src1 src2));
4670   effect(TEMP vtmp1, TEMP vtmp2);
4671   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4672   ins_encode %{
4673     int opcode = this->ideal_Opcode();
4674     int vlen = Matcher::vector_length(this, $src2);
4675     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4676   %}
4677   ins_pipe( pipe_slow );
4678 %}
4679 
4680 // =======================Long Reduction==========================================
4681 
4682 #ifdef _LP64
4683 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4684   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
4685   match(Set dst (AddReductionVL src1 src2));
4686   match(Set dst (MulReductionVL src1 src2));
4687   match(Set dst (AndReductionV  src1 src2));
4688   match(Set dst ( OrReductionV  src1 src2));
4689   match(Set dst (XorReductionV  src1 src2));
4690   match(Set dst (MinReductionV  src1 src2));
4691   match(Set dst (MaxReductionV  src1 src2));
4692   effect(TEMP vtmp1, TEMP vtmp2);
4693   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4694   ins_encode %{
4695     int opcode = this->ideal_Opcode();
4696     int vlen = Matcher::vector_length(this, $src2);
4697     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4698   %}
4699   ins_pipe( pipe_slow );
4700 %}
4701 
4702 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
4703   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
4704   match(Set dst (AddReductionVL src1 src2));
4705   match(Set dst (MulReductionVL src1 src2));
4706   match(Set dst (AndReductionV  src1 src2));
4707   match(Set dst ( OrReductionV  src1 src2));
4708   match(Set dst (XorReductionV  src1 src2));
4709   match(Set dst (MinReductionV  src1 src2));
4710   match(Set dst (MaxReductionV  src1 src2));
4711   effect(TEMP vtmp1, TEMP vtmp2);
4712   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4713   ins_encode %{
4714     int opcode = this->ideal_Opcode();
4715     int vlen = Matcher::vector_length(this, $src2);
4716     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4717   %}
4718   ins_pipe( pipe_slow );
4719 %}
4720 #endif // _LP64
4721 
4722 // =======================Float Reduction==========================================
4723 
4724 instruct reductionF128(regF dst, vec src, vec vtmp) %{
4725   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
4726   match(Set dst (AddReductionVF dst src));
4727   match(Set dst (MulReductionVF dst src));
4728   effect(TEMP dst, TEMP vtmp);
4729   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
4730   ins_encode %{
4731     int opcode = this->ideal_Opcode();
4732     int vlen = Matcher::vector_length(this, $src);
4733     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4734   %}
4735   ins_pipe( pipe_slow );
4736 %}
4737 
4738 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
4739   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4740   match(Set dst (AddReductionVF dst src));
4741   match(Set dst (MulReductionVF dst src));
4742   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4743   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4744   ins_encode %{
4745     int opcode = this->ideal_Opcode();
4746     int vlen = Matcher::vector_length(this, $src);
4747     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4748   %}
4749   ins_pipe( pipe_slow );
4750 %}
4751 
4752 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4753   predicate(Matcher::vector_length(n->in(2)) == 16); // src
4754   match(Set dst (AddReductionVF dst src));
4755   match(Set dst (MulReductionVF dst src));
4756   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4757   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4758   ins_encode %{
4759     int opcode = this->ideal_Opcode();
4760     int vlen = Matcher::vector_length(this, $src);
4761     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4762   %}
4763   ins_pipe( pipe_slow );
4764 %}
4765 
4766 // =======================Double Reduction==========================================
4767 
4768 instruct reduction2D(regD dst, vec src, vec vtmp) %{
4769   predicate(Matcher::vector_length(n->in(2)) == 2); // src
4770   match(Set dst (AddReductionVD dst src));
4771   match(Set dst (MulReductionVD dst src));
4772   effect(TEMP dst, TEMP vtmp);
4773   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
4774   ins_encode %{
4775     int opcode = this->ideal_Opcode();
4776     int vlen = Matcher::vector_length(this, $src);
4777     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4778   %}
4779   ins_pipe( pipe_slow );
4780 %}
4781 
4782 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
4783   predicate(Matcher::vector_length(n->in(2)) == 4); // src
4784   match(Set dst (AddReductionVD dst src));
4785   match(Set dst (MulReductionVD dst src));
4786   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4787   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4788   ins_encode %{
4789     int opcode = this->ideal_Opcode();
4790     int vlen = Matcher::vector_length(this, $src);
4791     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4792   %}
4793   ins_pipe( pipe_slow );
4794 %}
4795 
4796 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4797   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4798   match(Set dst (AddReductionVD dst src));
4799   match(Set dst (MulReductionVD dst src));
4800   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4801   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4802   ins_encode %{
4803     int opcode = this->ideal_Opcode();
4804     int vlen = Matcher::vector_length(this, $src);
4805     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4806   %}
4807   ins_pipe( pipe_slow );
4808 %}
4809 
4810 // =======================Byte Reduction==========================================
4811 
4812 #ifdef _LP64
4813 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4814   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
4815   match(Set dst (AddReductionVI src1 src2));
4816   match(Set dst (AndReductionV  src1 src2));
4817   match(Set dst ( OrReductionV  src1 src2));
4818   match(Set dst (XorReductionV  src1 src2));
4819   match(Set dst (MinReductionV  src1 src2));
4820   match(Set dst (MaxReductionV  src1 src2));
4821   effect(TEMP vtmp1, TEMP vtmp2);
4822   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4823   ins_encode %{
4824     int opcode = this->ideal_Opcode();
4825     int vlen = Matcher::vector_length(this, $src2);
4826     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4827   %}
4828   ins_pipe( pipe_slow );
4829 %}
4830 
4831 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4832   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
4833   match(Set dst (AddReductionVI src1 src2));
4834   match(Set dst (AndReductionV  src1 src2));
4835   match(Set dst ( OrReductionV  src1 src2));
4836   match(Set dst (XorReductionV  src1 src2));
4837   match(Set dst (MinReductionV  src1 src2));
4838   match(Set dst (MaxReductionV  src1 src2));
4839   effect(TEMP vtmp1, TEMP vtmp2);
4840   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4841   ins_encode %{
4842     int opcode = this->ideal_Opcode();
4843     int vlen = Matcher::vector_length(this, $src2);
4844     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4845   %}
4846   ins_pipe( pipe_slow );
4847 %}
4848 #endif
4849 
4850 // =======================Short Reduction==========================================
4851 
4852 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4853   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
4854   match(Set dst (AddReductionVI src1 src2));
4855   match(Set dst (MulReductionVI src1 src2));
4856   match(Set dst (AndReductionV  src1 src2));
4857   match(Set dst ( OrReductionV  src1 src2));
4858   match(Set dst (XorReductionV  src1 src2));
4859   match(Set dst (MinReductionV  src1 src2));
4860   match(Set dst (MaxReductionV  src1 src2));
4861   effect(TEMP vtmp1, TEMP vtmp2);
4862   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4863   ins_encode %{
4864     int opcode = this->ideal_Opcode();
4865     int vlen = Matcher::vector_length(this, $src2);
4866     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4867   %}
4868   ins_pipe( pipe_slow );
4869 %}
4870 
4871 // =======================Mul Reduction==========================================
4872 
4873 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4874   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4875             Matcher::vector_length(n->in(2)) <= 32); // src2
4876   match(Set dst (MulReductionVI src1 src2));
4877   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4878   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4879   ins_encode %{
4880     int opcode = this->ideal_Opcode();
4881     int vlen = Matcher::vector_length(this, $src2);
4882     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4883   %}
4884   ins_pipe( pipe_slow );
4885 %}
4886 
4887 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4888   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4889             Matcher::vector_length(n->in(2)) == 64); // src2
4890   match(Set dst (MulReductionVI src1 src2));
4891   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4892   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4893   ins_encode %{
4894     int opcode = this->ideal_Opcode();
4895     int vlen = Matcher::vector_length(this, $src2);
4896     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4897   %}
4898   ins_pipe( pipe_slow );
4899 %}
4900 
4901 //--------------------Min/Max Float Reduction --------------------
4902 // Float Min/Max Reduction
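// Two flavors are provided: the forms taking an immF/immD scalar require that
// scalar to be the identity of the reduction (+Inf for min, -Inf for max, as
// enforced by the predicate), so only the vector needs reducing; the *_av
// forms accumulate into dst and pass true instead of false to
// reduceFloatMinMax / reduceDoubleMinMax so dst participates as an input.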
4903 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
4904                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4905   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4906             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4907              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4908             Matcher::vector_length(n->in(2)) == 2);
4909   match(Set dst (MinReductionV src1 src2));
4910   match(Set dst (MaxReductionV src1 src2));
4911   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4912   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4913   ins_encode %{
4914     assert(UseAVX > 0, "sanity");
4915 
4916     int opcode = this->ideal_Opcode();
4917     int vlen = Matcher::vector_length(this, $src2);
4918     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4919                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4920   %}
4921   ins_pipe( pipe_slow );
4922 %}
4923 
4924 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
4925                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4926   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4927             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4928              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4929             Matcher::vector_length(n->in(2)) >= 4);
4930   match(Set dst (MinReductionV src1 src2));
4931   match(Set dst (MaxReductionV src1 src2));
4932   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4933   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4934   ins_encode %{
4935     assert(UseAVX > 0, "sanity");
4936 
4937     int opcode = this->ideal_Opcode();
4938     int vlen = Matcher::vector_length(this, $src2);
4939     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4940                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4941   %}
4942   ins_pipe( pipe_slow );
4943 %}
4944 
4945 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
4946                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4947   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4948             Matcher::vector_length(n->in(2)) == 2);
4949   match(Set dst (MinReductionV dst src));
4950   match(Set dst (MaxReductionV dst src));
4951   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4952   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4953   ins_encode %{
4954     assert(UseAVX > 0, "sanity");
4955 
4956     int opcode = this->ideal_Opcode();
4957     int vlen = Matcher::vector_length(this, $src);
4958     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4959                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4960   %}
4961   ins_pipe( pipe_slow );
4962 %}
4963 
4964 
4965 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
4966                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4967   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4968             Matcher::vector_length(n->in(2)) >= 4);
4969   match(Set dst (MinReductionV dst src));
4970   match(Set dst (MaxReductionV dst src));
4971   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4972   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4973   ins_encode %{
4974     assert(UseAVX > 0, "sanity");
4975 
4976     int opcode = this->ideal_Opcode();
4977     int vlen = Matcher::vector_length(this, $src);
4978     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4979                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4980   %}
4981   ins_pipe( pipe_slow );
4982 %}
4983 
4984 
4985 //--------------------Min/Max Double Reduction --------------------
4986 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
4987                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4988                             rFlagsReg cr) %{
4989   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4990             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4991              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4992             Matcher::vector_length(n->in(2)) == 2);
4993   match(Set dst (MinReductionV src1 src2));
4994   match(Set dst (MaxReductionV src1 src2));
4995   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4996   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4997   ins_encode %{
4998     assert(UseAVX > 0, "sanity");
4999 
5000     int opcode = this->ideal_Opcode();
5001     int vlen = Matcher::vector_length(this, $src2);
5002     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
5003                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
5004   %}
5005   ins_pipe( pipe_slow );
5006 %}
5007 
5008 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
5009                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
5010                            rFlagsReg cr) %{
5011   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5012             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
5013              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
5014             Matcher::vector_length(n->in(2)) >= 4);
5015   match(Set dst (MinReductionV src1 src2));
5016   match(Set dst (MaxReductionV src1 src2));
5017   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
5018   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
5019   ins_encode %{
5020     assert(UseAVX > 0, "sanity");
5021 
5022     int opcode = this->ideal_Opcode();
5023     int vlen = Matcher::vector_length(this, $src2);
5024     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
5025                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
5026   %}
5027   ins_pipe( pipe_slow );
5028 %}
5029 
5030 
5031 instruct minmax_reduction2D_av(legRegD dst, legVec src,
5032                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
5033                                rFlagsReg cr) %{
5034   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5035             Matcher::vector_length(n->in(2)) == 2);
5036   match(Set dst (MinReductionV dst src));
5037   match(Set dst (MaxReductionV dst src));
5038   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
5039   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
5040   ins_encode %{
5041     assert(UseAVX > 0, "sanity");
5042 
5043     int opcode = this->ideal_Opcode();
5044     int vlen = Matcher::vector_length(this, $src);
5045     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
5046                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
5047   %}
5048   ins_pipe( pipe_slow );
5049 %}
5050 
5051 instruct minmax_reductionD_av(legRegD dst, legVec src,
5052                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
5053                               rFlagsReg cr) %{
5054   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5055             Matcher::vector_length(n->in(2)) >= 4);
5056   match(Set dst (MinReductionV dst src));
5057   match(Set dst (MaxReductionV dst src));
5058   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
5059   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
5060   ins_encode %{
5061     assert(UseAVX > 0, "sanity");
5062 
5063     int opcode = this->ideal_Opcode();
5064     int vlen = Matcher::vector_length(this, $src);
5065     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
5066                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
5067   %}
5068   ins_pipe( pipe_slow );
5069 %}
5070 
5071 // ====================VECTOR ARITHMETIC=======================================
5072 
5073 // --------------------------------- ADD --------------------------------------
5074 
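// Each operation below follows the same three-rule pattern: a two-operand SSE
// form that updates dst in place (UseAVX == 0), a three-operand AVX register
// form, and an AVX form with a memory operand. The memory forms are limited to
// vectors larger than 8 bytes, presumably because the vector memory operand
// always reads at least a full 128-bit chunk.
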
5075 // Bytes vector add
5076 instruct vaddB(vec dst, vec src) %{
5077   predicate(UseAVX == 0);
5078   match(Set dst (AddVB dst src));
5079   format %{ "paddb   $dst,$src\t! add packedB" %}
5080   ins_encode %{
5081     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5082   %}
5083   ins_pipe( pipe_slow );
5084 %}
5085 
5086 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
5087   predicate(UseAVX > 0);
5088   match(Set dst (AddVB src1 src2));
5089   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
5090   ins_encode %{
5091     int vlen_enc = vector_length_encoding(this);
5092     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5093   %}
5094   ins_pipe( pipe_slow );
5095 %}
5096 
5097 instruct vaddB_mem(vec dst, vec src, memory mem) %{
5098   predicate((UseAVX > 0) &&
5099             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5100   match(Set dst (AddVB src (LoadVector mem)));
5101   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
5102   ins_encode %{
5103     int vlen_enc = vector_length_encoding(this);
5104     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5105   %}
5106   ins_pipe( pipe_slow );
5107 %}
5108 
5109 // Shorts/Chars vector add
5110 instruct vaddS(vec dst, vec src) %{
5111   predicate(UseAVX == 0);
5112   match(Set dst (AddVS dst src));
5113   format %{ "paddw   $dst,$src\t! add packedS" %}
5114   ins_encode %{
5115     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5116   %}
5117   ins_pipe( pipe_slow );
5118 %}
5119 
5120 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
5121   predicate(UseAVX > 0);
5122   match(Set dst (AddVS src1 src2));
5123   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
5124   ins_encode %{
5125     int vlen_enc = vector_length_encoding(this);
5126     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5127   %}
5128   ins_pipe( pipe_slow );
5129 %}
5130 
5131 instruct vaddS_mem(vec dst, vec src, memory mem) %{
5132   predicate((UseAVX > 0) &&
5133             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5134   match(Set dst (AddVS src (LoadVector mem)));
5135   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
5136   ins_encode %{
5137     int vlen_enc = vector_length_encoding(this);
5138     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5139   %}
5140   ins_pipe( pipe_slow );
5141 %}
5142 
5143 // Integers vector add
5144 instruct vaddI(vec dst, vec src) %{
5145   predicate(UseAVX == 0);
5146   match(Set dst (AddVI dst src));
5147   format %{ "paddd   $dst,$src\t! add packedI" %}
5148   ins_encode %{
5149     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
5150   %}
5151   ins_pipe( pipe_slow );
5152 %}
5153 
5154 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
5155   predicate(UseAVX > 0);
5156   match(Set dst (AddVI src1 src2));
5157   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
5158   ins_encode %{
5159     int vlen_enc = vector_length_encoding(this);
5160     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5161   %}
5162   ins_pipe( pipe_slow );
5163 %}
5164 
5166 instruct vaddI_mem(vec dst, vec src, memory mem) %{
5167   predicate((UseAVX > 0) &&
5168             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5169   match(Set dst (AddVI src (LoadVector mem)));
5170   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
5171   ins_encode %{
5172     int vlen_enc = vector_length_encoding(this);
5173     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5174   %}
5175   ins_pipe( pipe_slow );
5176 %}
5177 
5178 // Longs vector add
5179 instruct vaddL(vec dst, vec src) %{
5180   predicate(UseAVX == 0);
5181   match(Set dst (AddVL dst src));
5182   format %{ "paddq   $dst,$src\t! add packedL" %}
5183   ins_encode %{
5184     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
5185   %}
5186   ins_pipe( pipe_slow );
5187 %}
5188 
5189 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
5190   predicate(UseAVX > 0);
5191   match(Set dst (AddVL src1 src2));
5192   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
5193   ins_encode %{
5194     int vlen_enc = vector_length_encoding(this);
5195     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5196   %}
5197   ins_pipe( pipe_slow );
5198 %}
5199 
5200 instruct vaddL_mem(vec dst, vec src, memory mem) %{
5201   predicate((UseAVX > 0) &&
5202             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5203   match(Set dst (AddVL src (LoadVector mem)));
5204   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
5205   ins_encode %{
5206     int vlen_enc = vector_length_encoding(this);
5207     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5208   %}
5209   ins_pipe( pipe_slow );
5210 %}
5211 
5212 // Floats vector add
5213 instruct vaddF(vec dst, vec src) %{
5214   predicate(UseAVX == 0);
5215   match(Set dst (AddVF dst src));
5216   format %{ "addps   $dst,$src\t! add packedF" %}
5217   ins_encode %{
5218     __ addps($dst$$XMMRegister, $src$$XMMRegister);
5219   %}
5220   ins_pipe( pipe_slow );
5221 %}
5222 
5223 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
5224   predicate(UseAVX > 0);
5225   match(Set dst (AddVF src1 src2));
5226   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
5227   ins_encode %{
5228     int vlen_enc = vector_length_encoding(this);
5229     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5230   %}
5231   ins_pipe( pipe_slow );
5232 %}
5233 
5234 instruct vaddF_mem(vec dst, vec src, memory mem) %{
5235   predicate((UseAVX > 0) &&
5236             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5237   match(Set dst (AddVF src (LoadVector mem)));
5238   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
5239   ins_encode %{
5240     int vlen_enc = vector_length_encoding(this);
5241     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5242   %}
5243   ins_pipe( pipe_slow );
5244 %}
5245 
5246 // Doubles vector add
5247 instruct vaddD(vec dst, vec src) %{
5248   predicate(UseAVX == 0);
5249   match(Set dst (AddVD dst src));
5250   format %{ "addpd   $dst,$src\t! add packedD" %}
5251   ins_encode %{
5252     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
5253   %}
5254   ins_pipe( pipe_slow );
5255 %}
5256 
5257 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
5258   predicate(UseAVX > 0);
5259   match(Set dst (AddVD src1 src2));
5260   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
5261   ins_encode %{
5262     int vlen_enc = vector_length_encoding(this);
5263     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5264   %}
5265   ins_pipe( pipe_slow );
5266 %}
5267 
5268 instruct vaddD_mem(vec dst, vec src, memory mem) %{
5269   predicate((UseAVX > 0) &&
5270             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5271   match(Set dst (AddVD src (LoadVector mem)));
5272   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
5273   ins_encode %{
5274     int vlen_enc = vector_length_encoding(this);
5275     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5276   %}
5277   ins_pipe( pipe_slow );
5278 %}
5279 
5280 // --------------------------------- SUB --------------------------------------
5281 
5282 // Bytes vector sub
5283 instruct vsubB(vec dst, vec src) %{
5284   predicate(UseAVX == 0);
5285   match(Set dst (SubVB dst src));
5286   format %{ "psubb   $dst,$src\t! sub packedB" %}
5287   ins_encode %{
5288     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5289   %}
5290   ins_pipe( pipe_slow );
5291 %}
5292 
5293 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
5294   predicate(UseAVX > 0);
5295   match(Set dst (SubVB src1 src2));
5296   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
5297   ins_encode %{
5298     int vlen_enc = vector_length_encoding(this);
5299     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5300   %}
5301   ins_pipe( pipe_slow );
5302 %}
5303 
5304 instruct vsubB_mem(vec dst, vec src, memory mem) %{
5305   predicate((UseAVX > 0) &&
5306             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5307   match(Set dst (SubVB src (LoadVector mem)));
5308   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
5309   ins_encode %{
5310     int vlen_enc = vector_length_encoding(this);
5311     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5312   %}
5313   ins_pipe( pipe_slow );
5314 %}
5315 
5316 // Shorts/Chars vector sub
5317 instruct vsubS(vec dst, vec src) %{
5318   predicate(UseAVX == 0);
5319   match(Set dst (SubVS dst src));
5320   format %{ "psubw   $dst,$src\t! sub packedS" %}
5321   ins_encode %{
5322     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5323   %}
5324   ins_pipe( pipe_slow );
5325 %}
5326 
5328 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
5329   predicate(UseAVX > 0);
5330   match(Set dst (SubVS src1 src2));
5331   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
5332   ins_encode %{
5333     int vlen_enc = vector_length_encoding(this);
5334     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5335   %}
5336   ins_pipe( pipe_slow );
5337 %}
5338 
5339 instruct vsubS_mem(vec dst, vec src, memory mem) %{
5340   predicate((UseAVX > 0) &&
5341             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5342   match(Set dst (SubVS src (LoadVector mem)));
5343   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
5344   ins_encode %{
5345     int vlen_enc = vector_length_encoding(this);
5346     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5347   %}
5348   ins_pipe( pipe_slow );
5349 %}
5350 
5351 // Integers vector sub
5352 instruct vsubI(vec dst, vec src) %{
5353   predicate(UseAVX == 0);
5354   match(Set dst (SubVI dst src));
5355   format %{ "psubd   $dst,$src\t! sub packedI" %}
5356   ins_encode %{
5357     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
5358   %}
5359   ins_pipe( pipe_slow );
5360 %}
5361 
5362 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
5363   predicate(UseAVX > 0);
5364   match(Set dst (SubVI src1 src2));
5365   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
5366   ins_encode %{
5367     int vlen_enc = vector_length_encoding(this);
5368     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5369   %}
5370   ins_pipe( pipe_slow );
5371 %}
5372 
5373 instruct vsubI_mem(vec dst, vec src, memory mem) %{
5374   predicate((UseAVX > 0) &&
5375             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5376   match(Set dst (SubVI src (LoadVector mem)));
5377   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
5378   ins_encode %{
5379     int vlen_enc = vector_length_encoding(this);
5380     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5381   %}
5382   ins_pipe( pipe_slow );
5383 %}
5384 
5385 // Longs vector sub
5386 instruct vsubL(vec dst, vec src) %{
5387   predicate(UseAVX == 0);
5388   match(Set dst (SubVL dst src));
5389   format %{ "psubq   $dst,$src\t! sub packedL" %}
5390   ins_encode %{
5391     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
5392   %}
5393   ins_pipe( pipe_slow );
5394 %}
5395 
5396 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
5397   predicate(UseAVX > 0);
5398   match(Set dst (SubVL src1 src2));
5399   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
5400   ins_encode %{
5401     int vlen_enc = vector_length_encoding(this);
5402     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5403   %}
5404   ins_pipe( pipe_slow );
5405 %}
5406 
5408 instruct vsubL_mem(vec dst, vec src, memory mem) %{
5409   predicate((UseAVX > 0) &&
5410             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5411   match(Set dst (SubVL src (LoadVector mem)));
5412   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
5413   ins_encode %{
5414     int vlen_enc = vector_length_encoding(this);
5415     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5416   %}
5417   ins_pipe( pipe_slow );
5418 %}
5419 
5420 // Floats vector sub
5421 instruct vsubF(vec dst, vec src) %{
5422   predicate(UseAVX == 0);
5423   match(Set dst (SubVF dst src));
5424   format %{ "subps   $dst,$src\t! sub packedF" %}
5425   ins_encode %{
5426     __ subps($dst$$XMMRegister, $src$$XMMRegister);
5427   %}
5428   ins_pipe( pipe_slow );
5429 %}
5430 
5431 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
5432   predicate(UseAVX > 0);
5433   match(Set dst (SubVF src1 src2));
5434   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
5435   ins_encode %{
5436     int vlen_enc = vector_length_encoding(this);
5437     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5438   %}
5439   ins_pipe( pipe_slow );
5440 %}
5441 
5442 instruct vsubF_mem(vec dst, vec src, memory mem) %{
5443   predicate((UseAVX > 0) &&
5444             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5445   match(Set dst (SubVF src (LoadVector mem)));
5446   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
5447   ins_encode %{
5448     int vlen_enc = vector_length_encoding(this);
5449     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5450   %}
5451   ins_pipe( pipe_slow );
5452 %}
5453 
5454 // Doubles vector sub
5455 instruct vsubD(vec dst, vec src) %{
5456   predicate(UseAVX == 0);
5457   match(Set dst (SubVD dst src));
5458   format %{ "subpd   $dst,$src\t! sub packedD" %}
5459   ins_encode %{
5460     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
5461   %}
5462   ins_pipe( pipe_slow );
5463 %}
5464 
5465 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
5466   predicate(UseAVX > 0);
5467   match(Set dst (SubVD src1 src2));
5468   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
5469   ins_encode %{
5470     int vlen_enc = vector_length_encoding(this);
5471     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5472   %}
5473   ins_pipe( pipe_slow );
5474 %}
5475 
5476 instruct vsubD_mem(vec dst, vec src, memory mem) %{
5477   predicate((UseAVX > 0) &&
5478             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5479   match(Set dst (SubVD src (LoadVector mem)));
5480   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
5481   ins_encode %{
5482     int vlen_enc = vector_length_encoding(this);
5483     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5484   %}
5485   ins_pipe( pipe_slow );
5486 %}
5487 
5488 // --------------------------------- MUL --------------------------------------
5489 
5490 // Byte vector mul
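// x86 has no packed byte multiply, so MulVB is implemented by widening the byte lanes to
// words, multiplying with pmullw/vpmullw, masking each product back to its low byte and
// repacking. For example, (-3) * 7: bytes 0xFD and 0x07 widen to 0xFFFD and 0x0007,
// pmullw produces 0xFFEB, and the low byte 0xEB is -21.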
5491 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5492   predicate(Matcher::vector_length(n) == 4 ||
5493             Matcher::vector_length(n) == 8);
5494   match(Set dst (MulVB src1 src2));
5495   effect(TEMP dst, TEMP tmp, TEMP scratch);
5496   format %{"vector_mulB $dst,$src1,$src2" %}
5497   ins_encode %{
5498     assert(UseSSE > 3, "required");
5499     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
5500     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
5501     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
5502     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5503     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5504     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5505   %}
5506   ins_pipe( pipe_slow );
5507 %}
5508 
5509 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5510   predicate(Matcher::vector_length(n) == 16 && UseAVX <= 1);
5511   match(Set dst (MulVB src1 src2));
5512   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5513   format %{"vector_mulB $dst,$src1,$src2" %}
5514   ins_encode %{
5515     assert(UseSSE > 3, "required");
5516     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
5517     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
5518     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
5519     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
5520     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
5521     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5522     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
5523     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
5524     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5525     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5526     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5527     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5528   %}
5529   ins_pipe( pipe_slow );
5530 %}
5531 
5532 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5533   predicate(Matcher::vector_length(n) == 16 && UseAVX > 1);
5534   match(Set dst (MulVB src1 src2));
5535   effect(TEMP dst, TEMP tmp, TEMP scratch);
5536   format %{"vector_mulB $dst,$src1,$src2" %}
5537   ins_encode %{
5538     int vlen_enc = Assembler::AVX_256bit;
5539     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5540     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5541     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5542     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5543     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5544     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
5545     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
5546   %}
5547   ins_pipe( pipe_slow );
5548 %}
5549 
5550 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5551   predicate(Matcher::vector_length(n) == 32);
5552   match(Set dst (MulVB src1 src2));
5553   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5554   format %{"vector_mulB $dst,$src1,$src2" %}
5555   ins_encode %{
5556     assert(UseAVX > 1, "required");
5557     int vlen_enc = Assembler::AVX_256bit;
5558     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5559     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
5560     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5561     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5562     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5563     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5564     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5565     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5566     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5567     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5568     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5569     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5570     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5571     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5572   %}
5573   ins_pipe( pipe_slow );
5574 %}
5575 
5576 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5577   predicate(Matcher::vector_length(n) == 64);
5578   match(Set dst (MulVB src1 src2));
5579   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5580   format %{"vector_mulB $dst,$src1,$src2\n\t" %}
5581   ins_encode %{
5582     assert(UseAVX > 2, "required");
5583     int vlen_enc = Assembler::AVX_512bit;
5584     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5585     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
5586     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5587     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5588     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5589     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5590     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5591     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5592     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5593     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5594     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5595     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5596     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5597     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
5598     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5599   %}
5600   ins_pipe( pipe_slow );
5601 %}
5602 
5603 // Shorts/Chars vector mul
5604 instruct vmulS(vec dst, vec src) %{
5605   predicate(UseAVX == 0);
5606   match(Set dst (MulVS dst src));
5607   format %{ "pmullw $dst,$src\t! mul packedS" %}
5608   ins_encode %{
5609     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
5610   %}
5611   ins_pipe( pipe_slow );
5612 %}
5613 
5614 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
5615   predicate(UseAVX > 0);
5616   match(Set dst (MulVS src1 src2));
5617   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
5618   ins_encode %{
5619     int vlen_enc = vector_length_encoding(this);
5620     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5621   %}
5622   ins_pipe( pipe_slow );
5623 %}
5624 
5625 instruct vmulS_mem(vec dst, vec src, memory mem) %{
5626   predicate((UseAVX > 0) &&
5627             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5628   match(Set dst (MulVS src (LoadVector mem)));
5629   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
5630   ins_encode %{
5631     int vlen_enc = vector_length_encoding(this);
5632     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5633   %}
5634   ins_pipe( pipe_slow );
5635 %}
5636 
5637 // Integers vector mul
5638 instruct vmulI(vec dst, vec src) %{
5639   predicate(UseAVX == 0);
5640   match(Set dst (MulVI dst src));
5641   format %{ "pmulld  $dst,$src\t! mul packedI" %}
5642   ins_encode %{
5643     assert(UseSSE > 3, "required");
5644     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
5645   %}
5646   ins_pipe( pipe_slow );
5647 %}
5648 
5649 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
5650   predicate(UseAVX > 0);
5651   match(Set dst (MulVI src1 src2));
5652   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
5653   ins_encode %{
5654     int vlen_enc = vector_length_encoding(this);
5655     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5656   %}
5657   ins_pipe( pipe_slow );
5658 %}
5659 
5660 instruct vmulI_mem(vec dst, vec src, memory mem) %{
5661   predicate((UseAVX > 0) &&
5662             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5663   match(Set dst (MulVI src (LoadVector mem)));
5664   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
5665   ins_encode %{
5666     int vlen_enc = vector_length_encoding(this);
5667     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5668   %}
5669   ins_pipe( pipe_slow );
5670 %}
5671 
5672 // Longs vector mul
5673 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
5674   predicate(VM_Version::supports_avx512dq());
5675   match(Set dst (MulVL src1 src2));
5676   format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
5677   ins_encode %{
5678     assert(UseAVX > 2, "required");
5679     int vlen_enc = vector_length_encoding(this);
5680     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5681   %}
5682   ins_pipe( pipe_slow );
5683 %}
5684 
5685 instruct vmulL_mem(vec dst, vec src, memory mem) %{
5686   predicate(VM_Version::supports_avx512dq() &&
5687               (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5688   match(Set dst (MulVL src (LoadVector mem)));
5689   format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
5690   ins_encode %{
5691     assert(UseAVX > 2, "required");
5692     int vlen_enc = vector_length_encoding(this);
5693     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5694   %}
5695   ins_pipe( pipe_slow );
5696 %}
5697 
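// 64x64-bit multiply without AVX-512DQ: with a = aH:aL and b = bH:bL split into 32-bit
// halves,
//   a*b mod 2^64 = aL*bL + ((aL*bH + aH*bL) << 32)
// pmuludq supplies the unsigned aL*bL products, the pshufd/pmulld/phaddd sequence below
// computes the cross terms aL*bH + aH*bL, and psllq moves them into the upper half
// before the final paddq.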
5698 instruct mul2L_reg(vec dst, vec src2, legVec tmp) %{
5699   predicate(Matcher::vector_length(n) == 2 && !VM_Version::supports_avx512dq());
5700   match(Set dst (MulVL dst src2));
5701   effect(TEMP dst, TEMP tmp);
5702   format %{ "pshufd $tmp,$src2, 177\n\t"
5703             "pmulld $tmp,$dst\n\t"
5704             "phaddd $tmp,$tmp\n\t"
5705             "pmovzxdq $tmp,$tmp\n\t"
5706             "psllq $tmp, 32\n\t"
5707             "pmuludq $dst,$src2\n\t"
5708             "paddq $dst,$tmp\n\t! mul packed2L" %}
5709 
5710   ins_encode %{
5711     assert(VM_Version::supports_sse4_1(), "required");
5712     int vlen_enc = Assembler::AVX_128bit;
5713     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
5714     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
5715     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5716     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
5717     __ psllq($tmp$$XMMRegister, 32);
5718     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
5719     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
5720   %}
5721   ins_pipe( pipe_slow );
5722 %}
5723 
5724 instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, legVec tmp, legVec tmp1) %{
5725   predicate(Matcher::vector_length(n) == 4 && !VM_Version::supports_avx512dq());
5726   match(Set dst (MulVL src1 src2));
5727   effect(TEMP tmp1, TEMP tmp);
5728   format %{ "vpshufd $tmp,$src2\n\t"
5729             "vpmulld $tmp,$src1,$tmp\n\t"
5730             "vphaddd $tmp,$tmp,$tmp\n\t"
5731             "vpmovzxdq $tmp,$tmp\n\t"
5732             "vpsllq $tmp,$tmp\n\t"
5733             "vpmuludq $tmp1,$src1,$src2\n\t"
5734             "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
5735   ins_encode %{
5736     int vlen_enc = Assembler::AVX_256bit;
5737     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
5738     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5739     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
5740     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5741     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5742     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
5743     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5744     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5745   %}
5746   ins_pipe( pipe_slow );
5747 %}
5748 
5749 // Floats vector mul
5750 instruct vmulF(vec dst, vec src) %{
5751   predicate(UseAVX == 0);
5752   match(Set dst (MulVF dst src));
5753   format %{ "mulps   $dst,$src\t! mul packedF" %}
5754   ins_encode %{
5755     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
5756   %}
5757   ins_pipe( pipe_slow );
5758 %}
5759 
5760 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
5761   predicate(UseAVX > 0);
5762   match(Set dst (MulVF src1 src2));
5763   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
5764   ins_encode %{
5765     int vlen_enc = vector_length_encoding(this);
5766     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5767   %}
5768   ins_pipe( pipe_slow );
5769 %}
5770 
5771 instruct vmulF_mem(vec dst, vec src, memory mem) %{
5772   predicate((UseAVX > 0) &&
5773             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5774   match(Set dst (MulVF src (LoadVector mem)));
5775   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
5776   ins_encode %{
5777     int vlen_enc = vector_length_encoding(this);
5778     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5779   %}
5780   ins_pipe( pipe_slow );
5781 %}
5782 
5783 // Doubles vector mul
5784 instruct vmulD(vec dst, vec src) %{
5785   predicate(UseAVX == 0);
5786   match(Set dst (MulVD dst src));
5787   format %{ "mulpd   $dst,$src\t! mul packedD" %}
5788   ins_encode %{
5789     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
5790   %}
5791   ins_pipe( pipe_slow );
5792 %}
5793 
5794 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
5795   predicate(UseAVX > 0);
5796   match(Set dst (MulVD src1 src2));
5797   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
5798   ins_encode %{
5799     int vlen_enc = vector_length_encoding(this);
5800     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5801   %}
5802   ins_pipe( pipe_slow );
5803 %}
5804 
5805 instruct vmulD_mem(vec dst, vec src, memory mem) %{
5806   predicate((UseAVX > 0) &&
5807             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5808   match(Set dst (MulVD src (LoadVector mem)));
5809   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
5810   ins_encode %{
5811     int vlen_enc = vector_length_encoding(this);
5812     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5813   %}
5814   ins_pipe( pipe_slow );
5815 %}
5816 
5817 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5818   predicate(Matcher::vector_length(n) == 8);
5819   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
5820   effect(TEMP dst, USE src1, USE src2);
5821   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
5822             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
5823          %}
5824   ins_encode %{
5825     assert(UseAVX > 0, "required");
5826 
5827     int vlen_enc = Assembler::AVX_256bit;
5828     int cond = (Assembler::Condition)($copnd$$cmpcode);
5829     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5830     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5831   %}
5832   ins_pipe( pipe_slow );
5833 %}
5834 
5835 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5836   predicate(Matcher::vector_length(n) == 4);
5837   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
5838   effect(TEMP dst, USE src1, USE src2);
5839   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
5840             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
5841          %}
5842   ins_encode %{
5843     assert(UseAVX > 0, "required");
5844 
5845     int vlen_enc = Assembler::AVX_256bit;
5846     int cond = (Assembler::Condition)($copnd$$cmpcode);
5847     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5848     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5849   %}
5850   ins_pipe( pipe_slow );
5851 %}
5852 
5853 // --------------------------------- DIV --------------------------------------
5854 
5855 // Floats vector div
5856 instruct vdivF(vec dst, vec src) %{
5857   predicate(UseAVX == 0);
5858   match(Set dst (DivVF dst src));
5859   format %{ "divps   $dst,$src\t! div packedF" %}
5860   ins_encode %{
5861     __ divps($dst$$XMMRegister, $src$$XMMRegister);
5862   %}
5863   ins_pipe( pipe_slow );
5864 %}
5865 
5866 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
5867   predicate(UseAVX > 0);
5868   match(Set dst (DivVF src1 src2));
5869   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
5870   ins_encode %{
5871     int vlen_enc = vector_length_encoding(this);
5872     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5873   %}
5874   ins_pipe( pipe_slow );
5875 %}
5876 
5877 instruct vdivF_mem(vec dst, vec src, memory mem) %{
5878   predicate((UseAVX > 0) &&
5879             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5880   match(Set dst (DivVF src (LoadVector mem)));
5881   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
5882   ins_encode %{
5883     int vlen_enc = vector_length_encoding(this);
5884     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5885   %}
5886   ins_pipe( pipe_slow );
5887 %}
5888 
5889 // Doubles vector div
5890 instruct vdivD(vec dst, vec src) %{
5891   predicate(UseAVX == 0);
5892   match(Set dst (DivVD dst src));
5893   format %{ "divpd   $dst,$src\t! div packedD" %}
5894   ins_encode %{
5895     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
5896   %}
5897   ins_pipe( pipe_slow );
5898 %}
5899 
5900 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
5901   predicate(UseAVX > 0);
5902   match(Set dst (DivVD src1 src2));
5903   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
5904   ins_encode %{
5905     int vlen_enc = vector_length_encoding(this);
5906     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5907   %}
5908   ins_pipe( pipe_slow );
5909 %}
5910 
5911 instruct vdivD_mem(vec dst, vec src, memory mem) %{
5912   predicate((UseAVX > 0) &&
5913             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5914   match(Set dst (DivVD src (LoadVector mem)));
5915   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
5916   ins_encode %{
5917     int vlen_enc = vector_length_encoding(this);
5918     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5919   %}
5920   ins_pipe( pipe_slow );
5921 %}
5922 
5923 // ------------------------------ MinMax ---------------------------------------
5924 
5925 // Byte, Short, Int vector Min/Max
5926 instruct minmax_reg_sse(vec dst, vec src) %{
5927   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5928             UseAVX == 0);
5929   match(Set dst (MinV dst src));
5930   match(Set dst (MaxV dst src));
5931   format %{ "vector_minmax  $dst,$src\t!  " %}
5932   ins_encode %{
5933     assert(UseSSE >= 4, "required");
5934 
5935     int opcode = this->ideal_Opcode();
5936     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5937     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
5938   %}
5939   ins_pipe( pipe_slow );
5940 %}
5941 
5942 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
5943   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5944             UseAVX > 0);
5945   match(Set dst (MinV src1 src2));
5946   match(Set dst (MaxV src1 src2));
5947   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
5948   ins_encode %{
5949     int opcode = this->ideal_Opcode();
5950     int vlen_enc = vector_length_encoding(this);
5951     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5952 
5953     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5954   %}
5955   ins_pipe( pipe_slow );
5956 %}
5957 
5958 // Long vector Min/Max
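// There is no packed 64-bit min/max instruction before AVX-512 (vpminsq/vpmaxsq), so
// these rules compare and blend instead; the fixed rxmm0 TEMP in the SSE flavor reflects
// the implicit xmm0 operand of the SSE4.1 blendv instructions.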
5959 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
5960   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
5961             UseAVX == 0);
5962   match(Set dst (MinV dst src));
5963   match(Set dst (MaxV src dst));
5964   effect(TEMP dst, TEMP tmp);
5965   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
5966   ins_encode %{
5967     assert(UseSSE >= 4, "required");
5968 
5969     int opcode = this->ideal_Opcode();
5970     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5971     assert(elem_bt == T_LONG, "sanity");
5972 
5973     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
5974   %}
5975   ins_pipe( pipe_slow );
5976 %}
5977 
5978 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
5979   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
5980             UseAVX > 0 && !VM_Version::supports_avx512vl());
5981   match(Set dst (MinV src1 src2));
5982   match(Set dst (MaxV src1 src2));
5983   effect(TEMP dst);
5984   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
5985   ins_encode %{
5986     int vlen_enc = vector_length_encoding(this);
5987     int opcode = this->ideal_Opcode();
5988     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5989     assert(elem_bt == T_LONG, "sanity");
5990 
5991     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5992   %}
5993   ins_pipe( pipe_slow );
5994 %}
5995 
5996 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
5997   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
5998             Matcher::vector_element_basic_type(n) == T_LONG);
5999   match(Set dst (MinV src1 src2));
6000   match(Set dst (MaxV src1 src2));
6001   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
6002   ins_encode %{
6003     assert(UseAVX > 2, "required");
6004 
6005     int vlen_enc = vector_length_encoding(this);
6006     int opcode = this->ideal_Opcode();
6007     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6008     assert(elem_bt == T_LONG, "sanity");
6009 
6010     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6011   %}
6012   ins_pipe( pipe_slow );
6013 %}
6014 
6015 // Float/Double vector Min/Max
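// minps/maxps (and their pd/VEX forms) do not implement Java semantics: they return the
// second operand when either input is NaN and treat -0.0 as equal to +0.0, whereas
// Math.min/max must return NaN for NaN inputs and order -0.0 below +0.0. The
// vminmax_fp/evminmax_fp helpers blend around these cases, which is why the extra TEMP
// (and AVX-512 mask) registers are needed.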
6016 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
6017   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
6018             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
6019             UseAVX > 0);
6020   match(Set dst (MinV a b));
6021   match(Set dst (MaxV a b));
6022   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
6023   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
6024   ins_encode %{
6025     assert(UseAVX > 0, "required");
6026 
6027     int opcode = this->ideal_Opcode();
6028     int vlen_enc = vector_length_encoding(this);
6029     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6030 
6031     __ vminmax_fp(opcode, elem_bt,
6032                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
6033                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
6034   %}
6035   ins_pipe( pipe_slow );
6036 %}
6037 
6038 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
6039   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
6040             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
6041   match(Set dst (MinV a b));
6042   match(Set dst (MaxV a b));
6043   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
6044   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
6045   ins_encode %{
6046     assert(UseAVX > 2, "required");
6047 
6048     int opcode = this->ideal_Opcode();
6049     int vlen_enc = vector_length_encoding(this);
6050     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6051 
6052     __ evminmax_fp(opcode, elem_bt,
6053                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
6054                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
6055   %}
6056   ins_pipe( pipe_slow );
6057 %}
6058 
6059 // --------------------------------- Signum/CopySign ---------------------------
6060 
6061 instruct signumF_reg(regF dst, regF zero, regF one, rRegP scratch, rFlagsReg cr) %{
6062   match(Set dst (SignumF dst (Binary zero one)));
6063   effect(TEMP scratch, KILL cr);
6064   format %{ "signumF $dst, $dst\t! using $scratch as TEMP" %}
6065   ins_encode %{
6066     int opcode = this->ideal_Opcode();
6067     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
6068   %}
6069   ins_pipe( pipe_slow );
6070 %}
6071 
6072 instruct signumD_reg(regD dst, regD zero, regD one, rRegP scratch, rFlagsReg cr) %{
6073   match(Set dst (SignumD dst (Binary zero one)));
6074   effect(TEMP scratch, KILL cr);
6075   format %{ "signumD $dst, $dst\t! using $scratch as TEMP" %}
6076   ins_encode %{
6077     int opcode = this->ideal_Opcode();
6078     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
6079   %}
6080   ins_pipe( pipe_slow );
6081 %}
6082 
6083 // ---------------------------------------
6084 // For copySign use 0xE4 as writemask for vpternlog
6085 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
6086 // C (xmm2) is set to 0x7FFFFFFF
6087 // Wherever xmm2 is 0 (the sign bit), we want to pick from B (the sign operand)
6088 // Wherever xmm2 is 1 (the magnitude bits), we want to pick from A (the magnitude operand)
6089 //
6090 // A B C Result
6091 // 0 0 0 0
6092 // 0 0 1 0
6093 // 0 1 0 1
6094 // 0 1 1 0
6095 // 1 0 0 0
6096 // 1 0 1 1
6097 // 1 1 0 1
6098 // 1 1 1 1
6099 //
6100 // Result, read from high bit to low bit, is 0b11100100 = 0xE4
6101 // ---------------------------------------
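// As a cross-check, the immediate can be recomputed from the selector f(A,B,C) = C ? A : B,
// since bit ((A<<2)|(B<<1)|C) of the imm8 holds f(A,B,C). Illustrative standalone C++
// sketch, not part of this file:
//
//   int imm8 = 0;
//   for (int a = 0; a <= 1; a++)
//     for (int b = 0; b <= 1; b++)
//       for (int c = 0; c <= 1; c++)
//         imm8 |= (c ? a : b) << ((a << 2) | (b << 1) | c);
//   // imm8 == 0xE4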
6102 
6103 #ifdef _LP64
6104 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
6105   match(Set dst (CopySignF dst src));
6106   effect(TEMP tmp1, TEMP tmp2);
6107   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
6108   ins_encode %{
6109     __ movl($tmp2$$Register, 0x7FFFFFFF);
6110     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
6111     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
6112   %}
6113   ins_pipe( pipe_slow );
6114 %}
6115 
6116 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
6117   match(Set dst (CopySignD dst (Binary src zero)));
6118   ins_cost(100);
6119   effect(TEMP tmp1, TEMP tmp2);
6120   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
6121   ins_encode %{
6122     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
6123     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
6124     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
6125   %}
6126   ins_pipe( pipe_slow );
6127 %}
6128 #endif // _LP64
6129 
6130 // --------------------------------- Sqrt --------------------------------------
6131 
6132 instruct vsqrtF_reg(vec dst, vec src) %{
6133   match(Set dst (SqrtVF src));
6134   ins_cost(400);
6135   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
6136   ins_encode %{
6137     assert(UseAVX > 0, "required");
6138     int vlen_enc = vector_length_encoding(this);
6139     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6140   %}
6141   ins_pipe( pipe_slow );
6142 %}
6143 
6144 instruct vsqrtF_mem(vec dst, memory mem) %{
6145   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
6146   match(Set dst (SqrtVF (LoadVector mem)));
6147   ins_cost(400);
6148   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
6149   ins_encode %{
6150     assert(UseAVX > 0, "required");
6151     int vlen_enc = vector_length_encoding(this);
6152     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
6153   %}
6154   ins_pipe( pipe_slow );
6155 %}
6156 
6157 // Floating point vector sqrt
6158 instruct vsqrtD_reg(vec dst, vec src) %{
6159   match(Set dst (SqrtVD src));
6160   ins_cost(400);
6161   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
6162   ins_encode %{
6163     assert(UseAVX > 0, "required");
6164     int vlen_enc = vector_length_encoding(this);
6165     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6166   %}
6167   ins_pipe( pipe_slow );
6168 %}
6169 
6170 instruct vsqrtD_mem(vec dst, memory mem) %{
6171   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
6172   match(Set dst (SqrtVD (LoadVector mem)));
6173   ins_cost(400);
6174   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
6175   ins_encode %{
6176     assert(UseAVX > 0, "required");
6177     int vlen_enc = vector_length_encoding(this);
6178     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
6179   %}
6180   ins_pipe( pipe_slow );
6181 %}
6182 
6183 // ------------------------------ Shift ---------------------------------------
6184 
6185 // Left and right shift count vectors are the same on x86
6186 // (only lowest bits of xmm reg are used for count).
6187 instruct vshiftcnt(vec dst, rRegI cnt) %{
6188   match(Set dst (LShiftCntV cnt));
6189   match(Set dst (RShiftCntV cnt));
6190   format %{ "movdl    $dst,$cnt\t! load shift count" %}
6191   ins_encode %{
6192     __ movdl($dst$$XMMRegister, $cnt$$Register);
6193   %}
6194   ins_pipe( pipe_slow );
6195 %}
6196 
6197 // Byte vector shift
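// There are no byte shift instructions in SSE/AVX (psllw/psrlw/psraw start at word
// granularity), so byte shifts widen the lanes to words, shift, mask each result back to
// its low byte and repack with packuswb/vpackuswb.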
6198 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6199   predicate(Matcher::vector_length(n) <= 8 && VectorNode::is_vshift_cnt(n->in(2)));
6200   match(Set dst ( LShiftVB src shift));
6201   match(Set dst ( RShiftVB src shift));
6202   match(Set dst (URShiftVB src shift));
6203   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
6204   format %{"vector_byte_shift $dst,$src,$shift" %}
6205   ins_encode %{
6206     assert(UseSSE > 3, "required");
6207     int opcode = this->ideal_Opcode();
6208     bool sign = (opcode != Op_URShiftVB);
6209     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
6210     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
6211     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6212     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
6213     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
6214   %}
6215   ins_pipe( pipe_slow );
6216 %}
6217 
6218 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6219   predicate(Matcher::vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
6220             UseAVX <= 1);
6221   match(Set dst ( LShiftVB src shift));
6222   match(Set dst ( RShiftVB src shift));
6223   match(Set dst (URShiftVB src shift));
6224   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
6225   format %{"vector_byte_shift $dst,$src,$shift" %}
6226   ins_encode %{
6227     assert(UseSSE > 3, "required");
6228     int opcode = this->ideal_Opcode();
6229     bool sign = (opcode != Op_URShiftVB);
6230     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
6231     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
6232     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
6233     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
6234     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
6235     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6236     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
6237     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
6238     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
6239   %}
6240   ins_pipe( pipe_slow );
6241 %}
6242 
6243 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6244   predicate(Matcher::vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
6245             UseAVX > 1);
6246   match(Set dst ( LShiftVB src shift));
6247   match(Set dst ( RShiftVB src shift));
6248   match(Set dst (URShiftVB src shift));
6249   effect(TEMP dst, TEMP tmp, TEMP scratch);
6250   format %{"vector_byte_shift $dst,$src,$shift" %}
6251   ins_encode %{
6252     int opcode = this->ideal_Opcode();
6253     bool sign = (opcode != Op_URShiftVB);
6254     int vlen_enc = Assembler::AVX_256bit;
6255     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
6256     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6257     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6258     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
6259     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
6260   %}
6261   ins_pipe( pipe_slow );
6262 %}
6263 
6264 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6265   predicate(Matcher::vector_length(n) == 32 && VectorNode::is_vshift_cnt(n->in(2)));
6266   match(Set dst ( LShiftVB src shift));
6267   match(Set dst ( RShiftVB src shift));
6268   match(Set dst (URShiftVB src shift));
6269   effect(TEMP dst, TEMP tmp, TEMP scratch);
6270   format %{"vector_byte_shift $dst,$src,$shift" %}
6271   ins_encode %{
6272     assert(UseAVX > 1, "required");
6273     int opcode = this->ideal_Opcode();
6274     bool sign = (opcode != Op_URShiftVB);
6275     int vlen_enc = Assembler::AVX_256bit;
6276     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
6277     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6278     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6279     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6280     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6281     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6282     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6283     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6284     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6285   %}
6286   ins_pipe( pipe_slow );
6287 %}
6288 
6289 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6290   predicate(Matcher::vector_length(n) == 64 && VectorNode::is_vshift_cnt(n->in(2)));
6291   match(Set dst ( LShiftVB src shift));
6292   match(Set dst  (RShiftVB src shift));
6293   match(Set dst (URShiftVB src shift));
6294   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6295   format %{"vector_byte_shift $dst,$src,$shift" %}
6296   ins_encode %{
6297     assert(UseAVX > 2, "required");
6298     int opcode = this->ideal_Opcode();
6299     bool sign = (opcode != Op_URShiftVB);
6300     int vlen_enc = Assembler::AVX_512bit;
6301     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6302     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
6303     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6304     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6305     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6306     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6307     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6308     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6309     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6310     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
6311     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
6312     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6313   %}
6314   ins_pipe( pipe_slow );
6315 %}
6316 
6317 // Shorts vector logical right shift produces an incorrect Java result
6318 // for negative data because Java code converts the short value into an int with
6319 // sign extension before the shift. But char vectors are fine since chars are
6320 // unsigned values.
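// For example, with a short lane holding -1 (0xFFFF) and a shift of 1, Java evaluates
// ((int)-1) >>> 1 = 0x7FFFFFFF and keeps the low 16 bits 0xFFFF on narrowing, whereas a
// 16-bit psrlw would produce 0x7FFF.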
6321 // Shorts/Chars vector left shift
6322 instruct vshiftS(vec dst, vec src, vec shift) %{
6323   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6324   match(Set dst ( LShiftVS src shift));
6325   match(Set dst ( RShiftVS src shift));
6326   match(Set dst (URShiftVS src shift));
6327   effect(TEMP dst, USE src, USE shift);
6328   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
6329   ins_encode %{
6330     int opcode = this->ideal_Opcode();
6331     if (UseAVX > 0) {
6332       int vlen_enc = vector_length_encoding(this);
6333       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6334     } else {
6335       int vlen = Matcher::vector_length(this);
6336       if (vlen == 2) {
6337         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6338         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6339       } else if (vlen == 4) {
6340         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6341         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6342       } else {
6343         assert (vlen == 8, "sanity");
6344         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6345         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6346       }
6347     }
6348   %}
6349   ins_pipe( pipe_slow );
6350 %}
6351 
6352 // Integers vector left shift
6353 instruct vshiftI(vec dst, vec src, vec shift) %{
6354   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6355   match(Set dst ( LShiftVI src shift));
6356   match(Set dst ( RShiftVI src shift));
6357   match(Set dst (URShiftVI src shift));
6358   effect(TEMP dst, USE src, USE shift);
6359   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
6360   ins_encode %{
6361     int opcode = this->ideal_Opcode();
6362     if (UseAVX > 0) {
6363       int vlen_enc = vector_length_encoding(this);
6364       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6365     } else {
6366       int vlen = Matcher::vector_length(this);
6367       if (vlen == 2) {
6368         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6369         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6370       } else {
6371         assert(vlen == 4, "sanity");
6372         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6373         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6374       }
6375     }
6376   %}
6377   ins_pipe( pipe_slow );
6378 %}
6379 
6380 // Integers vector left constant shift
6381 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
6382   match(Set dst (LShiftVI src (LShiftCntV shift)));
6383   match(Set dst (RShiftVI src (RShiftCntV shift)));
6384   match(Set dst (URShiftVI src (RShiftCntV shift)));
6385   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
6386   ins_encode %{
6387     int opcode = this->ideal_Opcode();
6388     if (UseAVX > 0) {
6389       int vector_len = vector_length_encoding(this);
6390       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6391     } else {
6392       int vlen = Matcher::vector_length(this);
6393       if (vlen == 2) {
6394         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6395         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6396       } else {
6397         assert(vlen == 4, "sanity");
6398         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6399         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6400       }
6401     }
6402   %}
6403   ins_pipe( pipe_slow );
6404 %}
6405 
6406 // Longs vector shift
6407 instruct vshiftL(vec dst, vec src, vec shift) %{
6408   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6409   match(Set dst ( LShiftVL src shift));
6410   match(Set dst (URShiftVL src shift));
6411   effect(TEMP dst, USE src, USE shift);
6412   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
6413   ins_encode %{
6414     int opcode = this->ideal_Opcode();
6415     if (UseAVX > 0) {
6416       int vlen_enc = vector_length_encoding(this);
6417       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6418     } else {
6419       assert(Matcher::vector_length(this) == 2, "");
6420       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6421       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6422     }
6423   %}
6424   ins_pipe( pipe_slow );
6425 %}
6426 
6427 // Longs vector constant shift
6428 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
6429   match(Set dst (LShiftVL src (LShiftCntV shift)));
6430   match(Set dst (URShiftVL src (RShiftCntV shift)));
6431   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
6432   ins_encode %{
6433     int opcode = this->ideal_Opcode();
6434     if (UseAVX > 0) {
6435       int vector_len = vector_length_encoding(this);
6436       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6437     } else {
6438       assert(Matcher::vector_length(this) == 2, "");
6439       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6440       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6441     }
6442   %}
6443   ins_pipe( pipe_slow );
6444 %}
6445 
6446 // -------------------ArithmeticRightShift -----------------------------------
6447 // Long vector arithmetic right shift
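// SSE2/AVX2 have no 64-bit arithmetic right shift (vpsraq is EVEX-only), so it is
// emulated from the logical shift:  x >> s  ==  ((x >>> s) ^ m) - m,
// where m = 0x8000000000000000 >>> s re-extends the sign bit.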
6448 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6449   predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX <= 2);
6450   match(Set dst (RShiftVL src shift));
6451   effect(TEMP dst, TEMP tmp, TEMP scratch);
6452   format %{ "vshiftq $dst,$src,$shift" %}
6453   ins_encode %{
6454     uint vlen = Matcher::vector_length(this);
6455     if (vlen == 2) {
6456       assert(UseSSE >= 2, "required");
6457       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6458       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6459       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6460       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6461       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6462       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6463     } else {
6464       assert(vlen == 4, "sanity");
6465       assert(UseAVX > 1, "required");
6466       int vlen_enc = Assembler::AVX_256bit;
6467       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6468       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6469       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6470       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6471       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6472     }
6473   %}
6474   ins_pipe( pipe_slow );
6475 %}
6476 
6477 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6478   predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX > 2);
6479   match(Set dst (RShiftVL src shift));
6480   format %{ "vshiftq $dst,$src,$shift" %}
6481   ins_encode %{
6482     int vlen_enc = vector_length_encoding(this);
6483     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6484   %}
6485   ins_pipe( pipe_slow );
6486 %}
6487 
6488 // ------------------- Variable Shift -----------------------------
6489 // Byte variable shift
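// AVX2 provides variable-count shifts only for dword/qword lanes (AVX-512BW adds
// word forms), so byte shifts are emulated: elements and shift counts are widened,
// shifted via varshiftbw, and the word-sized results are packed back to bytes
// with vpackuswb.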
6490 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6491   predicate(Matcher::vector_length(n) <= 8 &&
6492             !VectorNode::is_vshift_cnt(n->in(2)) &&
6493             !VM_Version::supports_avx512bw());
6494   match(Set dst ( LShiftVB src shift));
6495   match(Set dst ( RShiftVB src shift));
6496   match(Set dst (URShiftVB src shift));
6497   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6498   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6499   ins_encode %{
6500     assert(UseAVX >= 2, "required");
6501 
6502     int opcode = this->ideal_Opcode();
6503     int vlen_enc = Assembler::AVX_128bit;
6504     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6505     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6506   %}
6507   ins_pipe( pipe_slow );
6508 %}
6509 
6510 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6511   predicate(Matcher::vector_length(n) == 16 &&
6512             !VectorNode::is_vshift_cnt(n->in(2)) &&
6513             !VM_Version::supports_avx512bw());
6514   match(Set dst ( LShiftVB src shift));
6515   match(Set dst ( RShiftVB src shift));
6516   match(Set dst (URShiftVB src shift));
6517   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6518   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6519   ins_encode %{
6520     assert(UseAVX >= 2, "required");
6521 
6522     int opcode = this->ideal_Opcode();
6523     int vlen_enc = Assembler::AVX_128bit;
6524     // Shift lower half and get word result in dst
6525     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6526 
6527     // Shift upper half and get word result in vtmp1
6528     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6529     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6530     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6531 
6532     // Merge and down convert the two word results to byte in dst
6533     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6534   %}
6535   ins_pipe( pipe_slow );
6536 %}
6537 
6538 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
6539   predicate(Matcher::vector_length(n) == 32 &&
6540             !VectorNode::is_vshift_cnt(n->in(2)) &&
6541             !VM_Version::supports_avx512bw());
6542   match(Set dst ( LShiftVB src shift));
6543   match(Set dst ( RShiftVB src shift));
6544   match(Set dst (URShiftVB src shift));
6545   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
6546   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
6547   ins_encode %{
6548     assert(UseAVX >= 2, "required");
6549 
6550     int opcode = this->ideal_Opcode();
6551     int vlen_enc = Assembler::AVX_128bit;
6552     // Process lower 128 bits and get result in dst
6553     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6554     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6555     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6556     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6557     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6558 
6559     // Process higher 128 bits and get result in vtmp3
6560     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6561     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6562     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
6563     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
6564     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
6565     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6566     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
6567 
6568     // Merge the two results in dst
6569     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6570   %}
6571   ins_pipe( pipe_slow );
6572 %}
6573 
6574 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6575   predicate(Matcher::vector_length(n) <= 32 &&
6576             !VectorNode::is_vshift_cnt(n->in(2)) &&
6577             VM_Version::supports_avx512bw());
6578   match(Set dst ( LShiftVB src shift));
6579   match(Set dst ( RShiftVB src shift));
6580   match(Set dst (URShiftVB src shift));
6581   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6582   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6583   ins_encode %{
6584     assert(UseAVX > 2, "required");
6585 
6586     int opcode = this->ideal_Opcode();
6587     int vlen_enc = vector_length_encoding(this);
6588     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6589   %}
6590   ins_pipe( pipe_slow );
6591 %}
6592 
6593 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6594   predicate(Matcher::vector_length(n) == 64 &&
6595             !VectorNode::is_vshift_cnt(n->in(2)) &&
6596             VM_Version::supports_avx512bw());
6597   match(Set dst ( LShiftVB src shift));
6598   match(Set dst ( RShiftVB src shift));
6599   match(Set dst (URShiftVB src shift));
6600   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6601   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6602   ins_encode %{
6603     assert(UseAVX > 2, "required");
6604 
6605     int opcode = this->ideal_Opcode();
6606     int vlen_enc = Assembler::AVX_256bit;
6607     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6608     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6609     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6610     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6611     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6612   %}
6613   ins_pipe( pipe_slow );
6614 %}
6615 
6616 // Short variable shift
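// Without AVX-512BW there is no variable-count word shift, so the shorts are
// widened to ints (sign- or zero-extended depending on the shift kind), shifted
// with the dword variable shift, masked back to 16 bits and re-packed with
// vpackusdw.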
6617 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6618   predicate(Matcher::vector_length(n) <= 8 &&
6619             !VectorNode::is_vshift_cnt(n->in(2)) &&
6620             !VM_Version::supports_avx512bw());
6621   match(Set dst ( LShiftVS src shift));
6622   match(Set dst ( RShiftVS src shift));
6623   match(Set dst (URShiftVS src shift));
6624   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6625   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6626   ins_encode %{
6627     assert(UseAVX >= 2, "required");
6628 
6629     int opcode = this->ideal_Opcode();
6630     bool sign = (opcode != Op_URShiftVS);
6631     int vlen_enc = Assembler::AVX_256bit;
6632     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6633     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6634     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
6635     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6636     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
6637     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6638   %}
6639   ins_pipe( pipe_slow );
6640 %}
6641 
6642 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6643   predicate(Matcher::vector_length(n) == 16 &&
6644             !VectorNode::is_vshift_cnt(n->in(2)) &&
6645             !VM_Version::supports_avx512bw());
6646   match(Set dst ( LShiftVS src shift));
6647   match(Set dst ( RShiftVS src shift));
6648   match(Set dst (URShiftVS src shift));
6649   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6650   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6651   ins_encode %{
6652     assert(UseAVX >= 2, "required");
6653 
6654     int opcode = this->ideal_Opcode();
6655     bool sign = (opcode != Op_URShiftVS);
6656     int vlen_enc = Assembler::AVX_256bit;
6657     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
6658     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6659     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6660     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6661     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6662 
6663     // Shift upper half, with result in dst using vtmp1 as TEMP
6664     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
6665     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
6666     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6667     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6668     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6669     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6670 
6671     // Merge lower and upper half result into dst
6672     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6673     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6674   %}
6675   ins_pipe( pipe_slow );
6676 %}
6677 
6678 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
6679   predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6680             VM_Version::supports_avx512bw());
6681   match(Set dst ( LShiftVS src shift));
6682   match(Set dst ( RShiftVS src shift));
6683   match(Set dst (URShiftVS src shift));
6684   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
6685   ins_encode %{
6686     assert(UseAVX > 2, "required");
6687 
6688     int opcode = this->ideal_Opcode();
6689     int vlen_enc = vector_length_encoding(this);
6690     if (!VM_Version::supports_avx512vl()) {
6691       vlen_enc = Assembler::AVX_512bit;
6692     }
6693     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6694   %}
6695   ins_pipe( pipe_slow );
6696 %}
6697 
6698 // Integer variable shift
6699 instruct vshiftI_var(vec dst, vec src, vec shift) %{
6700   predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6701   match(Set dst ( LShiftVI src shift));
6702   match(Set dst ( RShiftVI src shift));
6703   match(Set dst (URShiftVI src shift));
6704   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
6705   ins_encode %{
6706     assert(UseAVX >= 2, "required");
6707 
6708     int opcode = this->ideal_Opcode();
6709     int vlen_enc = vector_length_encoding(this);
6710     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6711   %}
6712   ins_pipe( pipe_slow );
6713 %}
6714 
6715 // Long variable shift
6716 instruct vshiftL_var(vec dst, vec src, vec shift) %{
6717   predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6718   match(Set dst ( LShiftVL src shift));
6719   match(Set dst (URShiftVL src shift));
6720   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6721   ins_encode %{
6722     assert(UseAVX >= 2, "required");
6723 
6724     int opcode = this->ideal_Opcode();
6725     int vlen_enc = vector_length_encoding(this);
6726     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6727   %}
6728   ins_pipe( pipe_slow );
6729 %}
6730 
6731 // Long variable arithmetic right shift
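// AVX2 lacks a variable-count arithmetic right shift for 64-bit lanes (VPSRAVQ is
// EVEX-only), so varshiftq() emulates it here and needs $vtmp as a temporary; the
// EVEX rule below shifts directly.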
6732 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
6733   predicate(Matcher::vector_length(n) <= 4 &&
6734             !VectorNode::is_vshift_cnt(n->in(2)) &&
6735             UseAVX == 2);
6736   match(Set dst (RShiftVL src shift));
6737   effect(TEMP dst, TEMP vtmp);
6738   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
6739   ins_encode %{
6740     int opcode = this->ideal_Opcode();
6741     int vlen_enc = vector_length_encoding(this);
6742     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
6743                  $vtmp$$XMMRegister);
6744   %}
6745   ins_pipe( pipe_slow );
6746 %}
6747 
6748 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
6749   predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6750             UseAVX > 2);
6751   match(Set dst (RShiftVL src shift));
6752   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6753   ins_encode %{
6754     int opcode = this->ideal_Opcode();
6755     int vlen_enc = vector_length_encoding(this);
6756     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6757   %}
6758   ins_pipe( pipe_slow );
6759 %}
6760 
6761 // --------------------------------- AND --------------------------------------
6762 
6763 instruct vand(vec dst, vec src) %{
6764   predicate(UseAVX == 0);
6765   match(Set dst (AndV dst src));
6766   format %{ "pand    $dst,$src\t! and vectors" %}
6767   ins_encode %{
6768     __ pand($dst$$XMMRegister, $src$$XMMRegister);
6769   %}
6770   ins_pipe( pipe_slow );
6771 %}
6772 
6773 instruct vand_reg(vec dst, vec src1, vec src2) %{
6774   predicate(UseAVX > 0);
6775   match(Set dst (AndV src1 src2));
6776   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
6777   ins_encode %{
6778     int vlen_enc = vector_length_encoding(this);
6779     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6780   %}
6781   ins_pipe( pipe_slow );
6782 %}
6783 
6784 instruct vand_mem(vec dst, vec src, memory mem) %{
6785   predicate((UseAVX > 0) &&
6786             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6787   match(Set dst (AndV src (LoadVector mem)));
6788   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
6789   ins_encode %{
6790     int vlen_enc = vector_length_encoding(this);
6791     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6792   %}
6793   ins_pipe( pipe_slow );
6794 %}
6795 
6796 // --------------------------------- OR ---------------------------------------
6797 
6798 instruct vor(vec dst, vec src) %{
6799   predicate(UseAVX == 0);
6800   match(Set dst (OrV dst src));
6801   format %{ "por     $dst,$src\t! or vectors" %}
6802   ins_encode %{
6803     __ por($dst$$XMMRegister, $src$$XMMRegister);
6804   %}
6805   ins_pipe( pipe_slow );
6806 %}
6807 
6808 instruct vor_reg(vec dst, vec src1, vec src2) %{
6809   predicate(UseAVX > 0);
6810   match(Set dst (OrV src1 src2));
6811   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
6812   ins_encode %{
6813     int vlen_enc = vector_length_encoding(this);
6814     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6815   %}
6816   ins_pipe( pipe_slow );
6817 %}
6818 
6819 instruct vor_mem(vec dst, vec src, memory mem) %{
6820   predicate((UseAVX > 0) &&
6821             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6822   match(Set dst (OrV src (LoadVector mem)));
6823   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
6824   ins_encode %{
6825     int vlen_enc = vector_length_encoding(this);
6826     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6827   %}
6828   ins_pipe( pipe_slow );
6829 %}
6830 
6831 // --------------------------------- XOR --------------------------------------
6832 
6833 instruct vxor(vec dst, vec src) %{
6834   predicate(UseAVX == 0);
6835   match(Set dst (XorV dst src));
6836   format %{ "pxor    $dst,$src\t! xor vectors" %}
6837   ins_encode %{
6838     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
6839   %}
6840   ins_pipe( pipe_slow );
6841 %}
6842 
6843 instruct vxor_reg(vec dst, vec src1, vec src2) %{
6844   predicate(UseAVX > 0);
6845   match(Set dst (XorV src1 src2));
6846   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
6847   ins_encode %{
6848     int vlen_enc = vector_length_encoding(this);
6849     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6850   %}
6851   ins_pipe( pipe_slow );
6852 %}
6853 
6854 instruct vxor_mem(vec dst, vec src, memory mem) %{
6855   predicate((UseAVX > 0) &&
6856             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6857   match(Set dst (XorV src (LoadVector mem)));
6858   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
6859   ins_encode %{
6860     int vlen_enc = vector_length_encoding(this);
6861     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6862   %}
6863   ins_pipe( pipe_slow );
6864 %}
6865 
6866 // --------------------------------- VectorCast --------------------------------------
6867 
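// Widening casts below use the sign-extending VPMOVSX* forms; casts to float and
// double additionally convert the widened ints with vcvtdq2ps / vcvtdq2pd.
// Narrowing casts either mask-and-pack (AVX/AVX2) or use the EVEX truncating
// moves (VPMOV*).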
6868 instruct vcastBtoX(vec dst, vec src) %{
6869   match(Set dst (VectorCastB2X src));
6870   format %{ "vector_cast_b2x $dst,$src\t!" %}
6871   ins_encode %{
6872     assert(UseAVX > 0, "required");
6873 
6874     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6875     int vlen_enc = vector_length_encoding(this);
6876     switch (to_elem_bt) {
6877       case T_SHORT:
6878         __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6879         break;
6880       case T_INT:
6881         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6882         break;
6883       case T_FLOAT:
6884         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6885         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6886         break;
6887       case T_LONG:
6888         __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6889         break;
6890       case T_DOUBLE:
6891         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6892         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6893         break;
6894 
6895       default: assert(false, "%s", type2name(to_elem_bt));
6896     }
6897   %}
6898   ins_pipe( pipe_slow );
6899 %}
6900 
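// Narrowing short -> byte without AVX-512VL/BW: vpackuswb saturates, so each word
// is first masked to its low byte (vector_short_to_byte_mask) so that the pack
// acts as a plain truncation.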
6901 instruct castStoX(vec dst, vec src, rRegP scratch) %{
6902   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6903             Matcher::vector_length(n->in(1)) <= 8 && // src
6904             Matcher::vector_element_basic_type(n) == T_BYTE);
6905   effect(TEMP scratch);
6906   match(Set dst (VectorCastS2X src));
6907   format %{ "vector_cast_s2x $dst,$src\t! using $scratch as TEMP" %}
6908   ins_encode %{
6909     assert(UseAVX > 0, "required");
6910 
6911     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, $scratch$$Register);
6912     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6913   %}
6914   ins_pipe( pipe_slow );
6915 %}
6916 
6917 instruct vcastStoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6918   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6919             Matcher::vector_length(n->in(1)) == 16 && // src
6920             Matcher::vector_element_basic_type(n) == T_BYTE);
6921   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6922   match(Set dst (VectorCastS2X src));
6923   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp, $scratch as TEMP" %}
6924   ins_encode %{
6925     assert(UseAVX > 0, "required");
6926 
6927     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
6928     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6929     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
6930     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6931   %}
6932   ins_pipe( pipe_slow );
6933 %}
6934 
6935 instruct vcastStoX_evex(vec dst, vec src) %{
6936   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
6937             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
6938   match(Set dst (VectorCastS2X src));
6939   format %{ "vector_cast_s2x $dst,$src\t!" %}
6940   ins_encode %{
6941     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6942     int src_vlen_enc = vector_length_encoding(this, $src);
6943     int vlen_enc = vector_length_encoding(this);
6944     switch (to_elem_bt) {
6945       case T_BYTE:
6946         if (!VM_Version::supports_avx512vl()) {
6947           vlen_enc = Assembler::AVX_512bit;
6948         }
6949         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6950         break;
6951       case T_INT:
6952         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6953         break;
6954       case T_FLOAT:
6955         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6956         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6957         break;
6958       case T_LONG:
6959         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6960         break;
6961       case T_DOUBLE:
6962         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6963         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6964         break;
6965       default:
6966         ShouldNotReachHere();
6967     }
6968   %}
6969   ins_pipe( pipe_slow );
6970 %}
6971 
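// Narrowing int -> byte/short on AVX/AVX2 follows the same pattern: mask each
// dword down to the target width first so the saturating packs (vpackusdw,
// vpackuswb) behave as truncation.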
6972 instruct castItoX(vec dst, vec src, rRegP scratch) %{
6973   predicate(UseAVX <= 2 &&
6974             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
6975             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
6976   match(Set dst (VectorCastI2X src));
6977   format %{ "vector_cast_i2x $dst,$src\t! using $scratch as TEMP" %}
6978   effect(TEMP scratch);
6979   ins_encode %{
6980     assert(UseAVX > 0, "required");
6981 
6982     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6983     int vlen_enc = vector_length_encoding(this, $src);
6984 
6985     if (to_elem_bt == T_BYTE) {
6986       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
6987       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6988       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6989     } else {
6990       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
6991       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6992       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6993     }
6994   %}
6995   ins_pipe( pipe_slow );
6996 %}
6997 
6998 instruct vcastItoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6999   predicate(UseAVX <= 2 &&
7000             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
7001             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
7002   match(Set dst (VectorCastI2X src));
7003   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp and $scratch as TEMP" %}
7004   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7005   ins_encode %{
7006     assert(UseAVX > 0, "required");
7007 
7008     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7009     int vlen_enc = vector_length_encoding(this, $src);
7010 
7011     if (to_elem_bt == T_BYTE) {
7012       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
7013       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
7014       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7015       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7016     } else {
7017       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
7018       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
7019       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
7020       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7021     }
7022   %}
7023   ins_pipe( pipe_slow );
7024 %}
7025 
7026 instruct vcastItoX_evex(vec dst, vec src) %{
7027   predicate(UseAVX > 2 ||
7028             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
7029   match(Set dst (VectorCastI2X src));
7030   format %{ "vector_cast_i2x $dst,$src\t!" %}
7031   ins_encode %{
7032     assert(UseAVX > 0, "required");
7033 
7034     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
7035     int src_vlen_enc = vector_length_encoding(this, $src);
7036     int dst_vlen_enc = vector_length_encoding(this);
7037     switch (dst_elem_bt) {
7038       case T_BYTE:
7039         if (!VM_Version::supports_avx512vl()) {
7040           src_vlen_enc = Assembler::AVX_512bit;
7041         }
7042         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7043         break;
7044       case T_SHORT:
7045         if (!VM_Version::supports_avx512vl()) {
7046           src_vlen_enc = Assembler::AVX_512bit;
7047         }
7048         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7049         break;
7050       case T_FLOAT:
7051         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7052         break;
7053       case T_LONG:
7054         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7055         break;
7056       case T_DOUBLE:
7057         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7058         break;
7059       default:
7060         ShouldNotReachHere();
7061     }
7062   %}
7063   ins_pipe( pipe_slow );
7064 %}
7065 
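// Narrowing long -> byte/short without the EVEX truncating moves: the low dwords
// of the qwords are gathered with shuffles, masked to the target width, and
// packed down, mirroring the int-narrowing sequence above.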
7066 instruct vcastLtoBS(vec dst, vec src, rRegP scratch) %{
7067   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
7068             UseAVX <= 2);
7069   match(Set dst (VectorCastL2X src));
7070   effect(TEMP scratch);
7071   format %{ "vector_cast_l2x  $dst,$src\t! using $scratch as TEMP" %}
7072   ins_encode %{
7073     assert(UseAVX > 0, "required");
7074 
7075     int vlen = Matcher::vector_length_in_bytes(this, $src);
7076     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
7077     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
7078                                                       : ExternalAddress(vector_int_to_short_mask());
7079     if (vlen <= 16) {
7080       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
7081       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
7082       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7083     } else {
7084       assert(vlen <= 32, "required");
7085       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
7086       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
7087       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
7088       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7089     }
7090     if (to_elem_bt == T_BYTE) {
7091       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7092     }
7093   %}
7094   ins_pipe( pipe_slow );
7095 %}
7096 
7097 instruct vcastLtoX_evex(vec dst, vec src) %{
7098   predicate(UseAVX > 2 ||
7099             (Matcher::vector_element_basic_type(n) == T_INT ||
7100              Matcher::vector_element_basic_type(n) == T_FLOAT ||
7101              Matcher::vector_element_basic_type(n) == T_DOUBLE));
7102   match(Set dst (VectorCastL2X src));
7103   format %{ "vector_cast_l2x  $dst,$src\t!" %}
7104   ins_encode %{
7105     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7106     int vlen = Matcher::vector_length_in_bytes(this, $src);
7107     int vlen_enc = vector_length_encoding(this, $src);
7108     switch (to_elem_bt) {
7109       case T_BYTE:
7110         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
7111           vlen_enc = Assembler::AVX_512bit;
7112         }
7113         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7114         break;
7115       case T_SHORT:
7116         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
7117           vlen_enc = Assembler::AVX_512bit;
7118         }
7119         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7120         break;
7121       case T_INT:
7122         if (vlen == 8) {
7123           if ($dst$$XMMRegister != $src$$XMMRegister) {
7124             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
7125           }
7126         } else if (vlen == 16) {
7127           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
7128         } else if (vlen == 32) {
7129           if (UseAVX > 2) {
7130             if (!VM_Version::supports_avx512vl()) {
7131               vlen_enc = Assembler::AVX_512bit;
7132             }
7133             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7134           } else {
7135             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
7136             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
7137           }
7138         } else { // vlen == 64
7139           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7140         }
7141         break;
7142       case T_FLOAT:
7143         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
7144         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7145         break;
7146       case T_DOUBLE:
7147         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
7148         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7149         break;
7150 
7151       default: assert(false, "%s", type2name(to_elem_bt));
7152     }
7153   %}
7154   ins_pipe( pipe_slow );
7155 %}
7156 
7157 instruct vcastFtoD_reg(vec dst, vec src) %{
7158   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
7159   match(Set dst (VectorCastF2X src));
7160   format %{ "vector_cast_f2x  $dst,$src\t!" %}
7161   ins_encode %{
7162     int vlen_enc = vector_length_encoding(this);
7163     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7164   %}
7165   ins_pipe( pipe_slow );
7166 %}
7167 
7168 instruct vcastDtoF_reg(vec dst, vec src) %{
7169   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
7170   match(Set dst (VectorCastD2X src));
7171   format %{ "vector_cast_d2x  $dst,$src\t!" %}
7172   ins_encode %{
7173     int vlen_enc = vector_length_encoding(this, $src);
7174     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7175   %}
7176   ins_pipe( pipe_slow );
7177 %}
7178 
7179 // --------------------------------- VectorMaskCmp --------------------------------------
7180 
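// Compare results come in two shapes: a vector whose lanes are all-ones/all-zeros
// (bottom_type()->isa_vectmask() == NULL), or, for predicated mask types on
// AVX-512, an opmask (k) register. The 512-bit non-predicated variants compare
// into a temporary k register and then expand it to a vector with a masked load
// of vector_all_bits_set().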
7181 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
7182   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7183             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
7184             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7185             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7186   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7187   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
7188   ins_encode %{
7189     int vlen_enc = vector_length_encoding(this, $src1);
7190     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7191     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7192       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7193     } else {
7194       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7195     }
7196   %}
7197   ins_pipe( pipe_slow );
7198 %}
7199 
7200 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
7201   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
7202             n->bottom_type()->isa_vectmask() == NULL &&
7203             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7204   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7205   effect(TEMP scratch, TEMP ktmp);
7206   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch and $ktmp as TEMP" %}
7207   ins_encode %{
7208     int vlen_enc = Assembler::AVX_512bit;
7209     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7210     KRegister mask = k0; // The comparison itself is not being masked.
7211     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7212       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7213       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
7214     } else {
7215       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7216       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
7217     }
7218   %}
7219   ins_pipe( pipe_slow );
7220 %}
7221 
7222 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
7223   predicate(n->bottom_type()->isa_vectmask() &&
7224             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7225   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7226   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
7227   ins_encode %{
7228     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
7229     int vlen_enc = vector_length_encoding(this, $src1);
7230     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7231     KRegister mask = k0; // The comparison itself is not being masked.
7232     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7233       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7234     } else {
7235       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7236     }
7237   %}
7238   ins_pipe( pipe_slow );
7239 %}
7240 
7241 instruct vcmp(legVec dst, legVec src1, legVec src2, immI8 cond, rRegP scratch) %{
7242   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7243             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7244             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
7245             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7246             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7247   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7248   effect(TEMP scratch);
7249   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7250   ins_encode %{
7251     int vlen_enc = vector_length_encoding(this, $src1);
7252     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7253     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
7254     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, ww, vlen_enc, $scratch$$Register);
7255   %}
7256   ins_pipe( pipe_slow );
7257 %}
7258 
7259 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
7260   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7261             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7262             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
7263             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 16 && // src1
7264             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7265   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7266   effect(TEMP vtmp1, TEMP vtmp2, TEMP scratch);
7267   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7268   ins_encode %{
7269     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7270     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7271     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
7272     __ vpcmpu(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
7273               $vtmp2$$XMMRegister, $scratch$$Register);
7274   %}
7275   ins_pipe( pipe_slow );
7276 %}
7277 
7278 instruct vcmpu32(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, legVec vtmp3, rRegP scratch) %{
7279   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7280             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7281             Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 32 && // src1
7282             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7283   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7284   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP scratch);
7285   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7286   ins_encode %{
7287     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7288     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7289     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
7290     __ vpcmpu32(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
7291                 $vtmp2$$XMMRegister, $vtmp3$$XMMRegister, $scratch$$Register);
7292   %}
7293   ins_pipe( pipe_slow );
7294 %}
7295 
7296 instruct vcmpu64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
7297   predicate((n->bottom_type()->isa_vectmask() == NULL &&
7298              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
7299              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7300   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7301   effect(TEMP scratch, TEMP ktmp);
7302   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch and $ktmp as TEMP" %}
7303   ins_encode %{
7304     assert(UseAVX > 2, "required");
7305 
7306     int vlen_enc = vector_length_encoding(this, $src1);
7307     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7308     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
7309     KRegister mask = k0; // The comparison itself is not being masked.
7310     bool merge = false;
7311     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7312 
7313     switch (src1_elem_bt) {
7314       case T_INT: {
7315         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7316         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7317         break;
7318       }
7319       case T_LONG: {
7320         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7321         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7322         break;
7323       }
7324       default: assert(false, "%s", type2name(src1_elem_bt));
7325     }
7326   %}
7327   ins_pipe( pipe_slow );
7328 %}
7329 
7330 
7331 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
7332   predicate(n->bottom_type()->isa_vectmask() &&
7333             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7334   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7335   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
7336   ins_encode %{
7337     assert(UseAVX > 2, "required");
7338     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
7339 
7340     int vlen_enc = vector_length_encoding(this, $src1);
7341     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7342     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
7343     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7344 
7345     // Unmasked comparison (mask register k0), dispatched on the element type of src1.
7346     switch (src1_elem_bt) {
7347       case T_BYTE: {
7348         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7349         break;
7350       }
7351       case T_SHORT: {
7352         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7353         break;
7354       }
7355       case T_INT: {
7356         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7357         break;
7358       }
7359       case T_LONG: {
7360         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7361         break;
7362       }
7363       default: assert(false, "%s", type2name(src1_elem_bt));
7364     }
7365   %}
7366   ins_pipe( pipe_slow );
7367 %}
7368 
7369 // Extract
7370 
7371 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
7372   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
7373   match(Set dst (ExtractI src idx));
7374   match(Set dst (ExtractS src idx));
7375 #ifdef _LP64
7376   match(Set dst (ExtractB src idx));
7377 #endif
7378   format %{ "extractI $dst,$src,$idx\t!" %}
7379   ins_encode %{
7380     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7381 
7382     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7383     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7384   %}
7385   ins_pipe( pipe_slow );
7386 %}
7387 
7388 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
7389   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
7390             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
7391   match(Set dst (ExtractI src idx));
7392   match(Set dst (ExtractS src idx));
7393 #ifdef _LP64
7394   match(Set dst (ExtractB src idx));
7395 #endif
7396   effect(TEMP vtmp);
7397   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
7398   ins_encode %{
7399     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7400 
7401     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7402     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7403     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
7404   %}
7405   ins_pipe( pipe_slow );
7406 %}
7407 
7408 #ifdef _LP64
7409 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
7410   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
7411   match(Set dst (ExtractL src idx));
7412   format %{ "extractL $dst,$src,$idx\t!" %}
7413   ins_encode %{
7414     assert(UseSSE >= 4, "required");
7415     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7416 
7417     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7418   %}
7419   ins_pipe( pipe_slow );
7420 %}
7421 
7422 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
7423   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7424             Matcher::vector_length(n->in(1)) == 8);  // src
7425   match(Set dst (ExtractL src idx));
7426   effect(TEMP vtmp);
7427   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
7428   ins_encode %{
7429     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7430 
7431     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7432     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
7433   %}
7434   ins_pipe( pipe_slow );
7435 %}
7436 #endif
7437 
7438 instruct extractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7439   predicate(Matcher::vector_length(n->in(1)) <= 4);
7440   match(Set dst (ExtractF src idx));
7441   effect(TEMP dst, TEMP tmp, TEMP vtmp);
7442   format %{ "extractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7443   ins_encode %{
7444     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7445 
7446     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $tmp$$Register, $vtmp$$XMMRegister);
7447   %}
7448   ins_pipe( pipe_slow );
7449 %}
7450 
7451 instruct vextractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7452   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
7453             Matcher::vector_length(n->in(1)/*src*/) == 16);
7454   match(Set dst (ExtractF src idx));
7455   effect(TEMP tmp, TEMP vtmp);
7456   format %{ "vextractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7457   ins_encode %{
7458     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7459 
7460     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7461     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant, $tmp$$Register);
7462   %}
7463   ins_pipe( pipe_slow );
7464 %}
7465 
7466 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
7467   predicate(Matcher::vector_length(n->in(1)) == 2); // src
7468   match(Set dst (ExtractD src idx));
7469   format %{ "extractD $dst,$src,$idx\t!" %}
7470   ins_encode %{
7471     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7472 
7473     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7474   %}
7475   ins_pipe( pipe_slow );
7476 %}
7477 
7478 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
7479   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7480             Matcher::vector_length(n->in(1)) == 8);  // src
7481   match(Set dst (ExtractD src idx));
7482   effect(TEMP vtmp);
7483   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
7484   ins_encode %{
7485     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7486 
7487     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7488     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
7489   %}
7490   ins_pipe( pipe_slow );
7491 %}
7492 
7493 // --------------------------------- Vector Blend --------------------------------------
7494 
7495 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
7496   predicate(UseAVX == 0);
7497   match(Set dst (VectorBlend (Binary dst src) mask));
7498   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
7499   effect(TEMP tmp);
7500   ins_encode %{
7501     assert(UseSSE >= 4, "required");
7502 
7503     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
7504       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
7505     }
7506     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
7507   %}
7508   ins_pipe( pipe_slow );
7509 %}
7510 
7511 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
7512   predicate(UseAVX > 0 &&
7513             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
7514             Matcher::vector_length_in_bytes(n) <= 32 &&
7515             is_integral_type(Matcher::vector_element_basic_type(n)));
7516   match(Set dst (VectorBlend (Binary src1 src2) mask));
7517   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7518   ins_encode %{
7519     int vlen_enc = vector_length_encoding(this);
7520     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7521   %}
7522   ins_pipe( pipe_slow );
7523 %}
7524 
7525 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
7526   predicate(UseAVX > 0 &&
7527             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
7528             Matcher::vector_length_in_bytes(n) <= 32 &&
7529             !is_integral_type(Matcher::vector_element_basic_type(n)));
7530   match(Set dst (VectorBlend (Binary src1 src2) mask));
7531   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7532   ins_encode %{
7533     int vlen_enc = vector_length_encoding(this);
7534     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7535   %}
7536   ins_pipe( pipe_slow );
7537 %}
7538 
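// 512-bit blends have no vector-mask encoding, so the byte-vector mask is first
// converted into an opmask register by comparing it against all-ones, then used
// for a merge-masked evpblend.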
7539 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch, kReg ktmp) %{
7540   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
7541             n->in(2)->bottom_type()->isa_vectmask() == NULL);
7542   match(Set dst (VectorBlend (Binary src1 src2) mask));
7543   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch and $ktmp as TEMP" %}
7544   effect(TEMP scratch, TEMP ktmp);
7545   ins_encode %{
7546     int vlen_enc = Assembler::AVX_512bit;
7547     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7548     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register);
7549     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7550   %}
7551   ins_pipe( pipe_slow );
7552 %}
7553 
7554 
7555 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask, rRegP scratch) %{
7556   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
7557             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
7558              VM_Version::supports_avx512bw()));
7559   match(Set dst (VectorBlend (Binary src1 src2) mask));
7560   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch as TEMP" %}
7561   effect(TEMP scratch);
7562   ins_encode %{
7563     int vlen_enc = vector_length_encoding(this);
7564     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7565     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7566   %}
7567   ins_pipe( pipe_slow );
7568 %}
7569 
7570 // --------------------------------- ABS --------------------------------------
7571 // a = |a|
7572 instruct vabsB_reg(vec dst, vec src) %{
7573   match(Set dst (AbsVB  src));
7574   ins_cost(450);
7575   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
7576   ins_encode %{
7577     uint vlen = Matcher::vector_length(this);
7578     if (vlen <= 16) {
7579       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7580     } else {
7581       int vlen_enc = vector_length_encoding(this);
7582       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7583     }
7584   %}
7585   ins_pipe( pipe_slow );
7586 %}
7587 
7588 instruct vabsS_reg(vec dst, vec src) %{
7589   match(Set dst (AbsVS  src));
7590   ins_cost(450);
7591   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
7592   ins_encode %{
7593     uint vlen = Matcher::vector_length(this);
7594     if (vlen <= 8) {
7595       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7596     } else {
7597       int vlen_enc = vector_length_encoding(this);
7598       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7599     }
7600   %}
7601   ins_pipe( pipe_slow );
7602 %}
7603 
7604 instruct vabsI_reg(vec dst, vec src) %{
7605   match(Set dst (AbsVI  src));
7606   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
7607   ins_cost(250);
7608   ins_encode %{
7609     uint vlen = Matcher::vector_length(this);
7610     if (vlen <= 4) {
7611       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7612     } else {
7613       int vlen_enc = vector_length_encoding(this);
7614       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7615     }
7616   %}
7617   ins_pipe( pipe_slow );
7618 %}
7619 
7620 instruct vabsL_reg(vec dst, vec src) %{
7621   match(Set dst (AbsVL  src));
7622   ins_cost(450);
7623   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
7624   ins_encode %{
7625     assert(UseAVX > 2, "required");
7626     int vlen_enc = vector_length_encoding(this);
7627     if (!VM_Version::supports_avx512vl()) {
7628       vlen_enc = Assembler::AVX_512bit;
7629     }
7630     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7631   %}
7632   ins_pipe( pipe_slow );
7633 %}
7634 
7635 // --------------------------------- ABSNEG --------------------------------------
7636 
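// Float/double abs and neg are implemented by AND-ing or XOR-ing the sign bits
// with a mask constant loaded from memory; $scratch is used to materialize the
// constant's address.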
7637 instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
7638   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
7639   match(Set dst (AbsVF src));
7640   match(Set dst (NegVF src));
7641   effect(TEMP scratch);
7642   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
7643   ins_cost(150);
7644   ins_encode %{
7645     int opcode = this->ideal_Opcode();
7646     int vlen = Matcher::vector_length(this);
7647     if (vlen == 2) {
7648       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7649     } else {
7650       assert(vlen == 8 || vlen == 16, "required");
7651       int vlen_enc = vector_length_encoding(this);
7652       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7653     }
7654   %}
7655   ins_pipe( pipe_slow );
7656 %}
7657 
7658 instruct vabsneg4F(vec dst, rRegI scratch) %{
7659   predicate(Matcher::vector_length(n) == 4);
7660   match(Set dst (AbsVF dst));
7661   match(Set dst (NegVF dst));
7662   effect(TEMP scratch);
7663   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
7664   ins_cost(150);
7665   ins_encode %{
7666     int opcode = this->ideal_Opcode();
7667     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
7668   %}
7669   ins_pipe( pipe_slow );
7670 %}
7671 
7672 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
7673   match(Set dst (AbsVD  src));
7674   match(Set dst (NegVD  src));
7675   effect(TEMP scratch);
7676   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
7677   ins_encode %{
7678     int opcode = this->ideal_Opcode();
7679     uint vlen = Matcher::vector_length(this);
7680     if (vlen == 2) {
7681       assert(UseSSE >= 2, "required");
7682       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7683     } else {
7684       int vlen_enc = vector_length_encoding(this);
7685       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7686     }
7687   %}
7688   ins_pipe( pipe_slow );
7689 %}
7690 
7691 //------------------------------------- VectorTest --------------------------------------------
7692 
7693 #ifdef _LP64
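// "alltrue" is computed via the carry flag: vectortest() arranges a (V)PTEST-style
// test that sets CF only when every lane of the boolean vector is set, and the
// result is materialized with setb(carrySet). The *_evex variants test opmask
// registers instead.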
7694 instruct vptest_alltrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp1, legVec vtmp2, rFlagsReg cr) %{
7695   predicate(!VM_Version::supports_avx512bwdq() &&
7696             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7697             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7698             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7699   match(Set dst (VectorTest src1 src2 ));
7700   effect(TEMP vtmp1, TEMP vtmp2, KILL cr);
7701   format %{ "vptest_alltrue_lt16 $dst,$src1, $src2\t! using $vtmp1, $vtmp2 and $cr as TEMP" %}
7702   ins_encode %{
7703     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7704     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
7705     __ setb(Assembler::carrySet, $dst$$Register);
7706     __ movzbl($dst$$Register, $dst$$Register);
7707   %}
7708   ins_pipe( pipe_slow );
7709 %}
7710 
7711 instruct vptest_alltrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7712   predicate(!VM_Version::supports_avx512bwdq() &&
7713             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7714             Matcher::vector_length_in_bytes(n->in(1)) <  64 &&
7715             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7716   match(Set dst (VectorTest src1 src2 ));
7717   effect(KILL cr);
7718   format %{ "vptest_alltrue_ge16  $dst,$src1, $src2\t! using $cr as TEMP" %}
7719   ins_encode %{
7720     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7721     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7722     __ setb(Assembler::carrySet, $dst$$Register);
7723     __ movzbl($dst$$Register, $dst$$Register);
7724   %}
7725   ins_pipe( pipe_slow );
7726 %}
7727 
7728 instruct vptest_alltrue_lt8_evex(rRegI dst, kReg src1, kReg src2, kReg kscratch, rFlagsReg cr) %{
7729   predicate(VM_Version::supports_avx512bwdq() &&
7730             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
7731             n->in(1)->bottom_type()->isa_vectmask() &&
7732             Matcher::vector_length(n->in(1)) < 8);
7733   match(Set dst (VectorTest src1 src2));
7734   effect(KILL cr, TEMP kscratch);
7735   format %{ "vptest_alltrue_lt8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
7736   ins_encode %{
7737     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7738     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7739     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7740     uint masklen = Matcher::vector_length(this, $src1);
7741     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, $kscratch$$KRegister);
7742   %}
7743   ins_pipe( pipe_slow );
7744 %}
7745 
7746 
7747 instruct vptest_alltrue_ge8_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
7748   predicate(VM_Version::supports_avx512bwdq() &&
7749             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
7750             n->in(1)->bottom_type()->isa_vectmask() &&
7751             Matcher::vector_length(n->in(1)) >= 8);
7752   match(Set dst (VectorTest src1 src2));
7753   effect(KILL cr);
7754   format %{ "vptest_alltrue_ge8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
7755   ins_encode %{
7756     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7757     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7758     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7759     uint masklen = Matcher::vector_length(this, $src1);
7760     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, knoreg);
7761   %}
7762   ins_pipe( pipe_slow );
7763 %}
7764 
7765 
7766 instruct vptest_anytrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp, rFlagsReg cr) %{
7767   predicate(!VM_Version::supports_avx512bwdq() &&
7768             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7769             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7770             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7771   match(Set dst (VectorTest src1 src2 ));
7772   effect(TEMP vtmp, KILL cr);
7773   format %{ "vptest_anytrue_lt16 $dst,$src1,$src2\t! using $vtmp, $cr as TEMP" %}
7774   ins_encode %{
7775     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7776     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7777     __ setb(Assembler::notZero, $dst$$Register);
7778     __ movzbl($dst$$Register, $dst$$Register);
7779   %}
7780   ins_pipe( pipe_slow );
7781 %}
7782 
7783 instruct vptest_anytrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7784   predicate(!VM_Version::supports_avx512bwdq() &&
7785             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7786             Matcher::vector_length_in_bytes(n->in(1)) < 64  &&
7787             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7788   match(Set dst (VectorTest src1 src2 ));
7789   effect(KILL cr);
7790   format %{ "vptest_anytrue_ge16 $dst,$src1,$src2\t! using $cr as TEMP" %}
7791   ins_encode %{
7792     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7793     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7794     __ setb(Assembler::notZero, $dst$$Register);
7795     __ movzbl($dst$$Register, $dst$$Register);
7796   %}
7797   ins_pipe( pipe_slow );
7798 %}
7799 
7800 instruct vptest_anytrue_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
7801   predicate(VM_Version::supports_avx512bwdq() &&
7802             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7803   match(Set dst (VectorTest src1 src2));
7804   effect(KILL cr);
7805   format %{ "vptest_anytrue_lt8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
7806   ins_encode %{
7807     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7808     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7809     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7810     uint  masklen = Matcher::vector_length(this, $src1);
7811     __ anytrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister);
7812   %}
7813   ins_pipe( pipe_slow );
7814 %}
7815 
7816 instruct cmpvptest_anytrue_lt16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, legVec vtmp) %{
7817   predicate(!VM_Version::supports_avx512bwdq() &&
7818             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 4 &&
7819             Matcher::vector_length_in_bytes(n->in(1)->in(1)) < 16 &&
7820             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7821   match(Set cr (CmpI (VectorTest src1 src2) zero));
7822   effect(TEMP vtmp);
7823   format %{ "cmpvptest_anytrue_lt16 $src1,$src2\t! using $vtmp as TEMP" %}
7824   ins_encode %{
7825     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7826     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7827   %}
7828   ins_pipe( pipe_slow );
7829 %}
7830 
7831 instruct cmpvptest_anytrue_ge16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero) %{
7832   predicate(!VM_Version::supports_avx512bwdq() &&
7833             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 16 &&
7834             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <  64 &&
7835             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7836   match(Set cr (CmpI (VectorTest src1 src2) zero));
7837   format %{ "cmpvptest_anytrue_ge16 $src1,$src2\t!" %}
7838   ins_encode %{
7839     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7840     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7841   %}
7842   ins_pipe( pipe_slow );
7843 %}
7844 
7845 instruct cmpvptest_anytrue_evex(rFlagsReg cr, kReg src1, kReg src2, immI_0 zero) %{
7846   predicate(VM_Version::supports_avx512bwdq() &&
7847             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7848   match(Set cr (CmpI (VectorTest src1 src2) zero));
7849   format %{ "cmpvptest_anytrue_evex $src1,$src2\t!" %}
7850   ins_encode %{
7851     uint masklen = Matcher::vector_length(this, $src1);
7852     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7853     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7854     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7855     masklen = masklen < 8 ? 8 : masklen;
7856     __ ktest(masklen, $src1$$KRegister, $src2$$KRegister);
7857   %}
7858   ins_pipe( pipe_slow );
7859 %}
7860 #endif
7861 
7862 //------------------------------------- LoadMask --------------------------------------------
7863 
7864 instruct loadMask(legVec dst, legVec src) %{
7865   predicate(n->bottom_type()->isa_vectmask() == NULL && !VM_Version::supports_avx512vlbw());
7866   match(Set dst (VectorLoadMask src));
7867   effect(TEMP dst);
7868   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
7869   ins_encode %{
7870     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7871     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7872     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
7873   %}
7874   ins_pipe( pipe_slow );
7875 %}
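
// Illustration for loadMask above: VectorLoadMask widens a boolean vector (one byte per
// lane, 0 or 1) into a full-width lane mask of the destination element type. For example,
// with four T_INT lanes the input bytes {1, 0, 0, 1} become the ints {-1, 0, 0, -1}.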
7876 
7877 instruct loadMask64(kReg dst, vec src, vec xtmp, rRegI tmp) %{
7878   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
7879   match(Set dst (VectorLoadMask src));
7880   effect(TEMP xtmp, TEMP tmp);
7881   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp and $tmp as TEMP" %}
7882   ins_encode %{
7883     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
7884                         $tmp$$Register, true, Assembler::AVX_512bit);
7885   %}
7886   ins_pipe( pipe_slow );
7887 %}
7888 
7889 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
7890   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
7891   match(Set dst (VectorLoadMask src));
7892   effect(TEMP xtmp);
7893   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
7894   ins_encode %{
7895     int vlen_enc = vector_length_encoding(in(1));
7896     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
7897                         noreg, false, vlen_enc);
7898   %}
7899   ins_pipe( pipe_slow );
7900 %}
7901 
7902 //------------------------------------- StoreMask --------------------------------------------
7903 
7904 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
7905   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
7906   match(Set dst (VectorStoreMask src size));
7907   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
7908   ins_encode %{
7909     int vlen = Matcher::vector_length(this);
7910     if (vlen <= 16 && UseAVX <= 2) {
7911       assert(UseSSE >= 3, "required");
7912       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7913     } else {
7914       assert(UseAVX > 0, "required");
7915       int src_vlen_enc = vector_length_encoding(this, $src);
7916       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7917     }
7918   %}
7919   ins_pipe( pipe_slow );
7920 %}
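
// Illustration for vstoreMask1B above: VectorStoreMask narrows a lane mask back to one
// byte per lane holding 0 or 1. With byte lanes the mask is already 0x00/0xFF, so a
// byte-wise absolute value suffices: pabsb turns 0xFF (-1) into 0x01 and leaves 0x00 alone.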
7921 
7922 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
7923   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
7924   match(Set dst (VectorStoreMask src size));
7925   effect(TEMP_DEF dst, TEMP xtmp);
7926   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
7927   ins_encode %{
7928     int vlen_enc = Assembler::AVX_128bit;
7929     int vlen = Matcher::vector_length(this);
7930     if (vlen <= 8) {
7931       assert(UseSSE >= 3, "required");
7932       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
7933       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7934       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
7935     } else {
7936       assert(UseAVX > 0, "required");
7937       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
7938       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7939       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7940     }
7941   %}
7942   ins_pipe( pipe_slow );
7943 %}
7944 
7945 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
7946   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
7947   match(Set dst (VectorStoreMask src size));
7948   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
7949   effect(TEMP_DEF dst, TEMP xtmp);
7950   ins_encode %{
7951     int vlen_enc = Assembler::AVX_128bit;
7952     int vlen = Matcher::vector_length(this);
7953     if (vlen <= 4) {
7954       assert(UseSSE >= 3, "required");
7955       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
7956       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7957       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
7958       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
7959     } else {
7960       assert(UseAVX > 0, "required");
7961       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
7962       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
7963       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7964       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
7965       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7966     }
7967   %}
7968   ins_pipe( pipe_slow );
7969 %}
7970 
7971 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
7972   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
7973   match(Set dst (VectorStoreMask src size));
7974   effect(TEMP_DEF dst, TEMP xtmp);
7975   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
7976   ins_encode %{
7977     assert(UseSSE >= 3, "required");
7978     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
7979     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
7980     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
7981     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
7982     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
7983   %}
7984   ins_pipe( pipe_slow );
7985 %}
7986 
7987 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
7988   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
7989   match(Set dst (VectorStoreMask src size));
7990   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
7991   effect(TEMP_DEF dst, TEMP vtmp);
7992   ins_encode %{
7993     int vlen_enc = Assembler::AVX_128bit;
7994     __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
7995     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
7996     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
7997     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
7998     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
7999     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8000     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8001   %}
8002   ins_pipe( pipe_slow );
8003 %}
8004 
8005 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
8006   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8007   match(Set dst (VectorStoreMask src size));
8008   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8009   ins_encode %{
8010     int src_vlen_enc = vector_length_encoding(this, $src);
8011     int dst_vlen_enc = vector_length_encoding(this);
8012     if (!VM_Version::supports_avx512vl()) {
8013       src_vlen_enc = Assembler::AVX_512bit;
8014     }
8015     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8016     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8017   %}
8018   ins_pipe( pipe_slow );
8019 %}
8020 
8021 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
8022   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8023   match(Set dst (VectorStoreMask src size));
8024   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8025   ins_encode %{
8026     int src_vlen_enc = vector_length_encoding(this, $src);
8027     int dst_vlen_enc = vector_length_encoding(this);
8028     if (!VM_Version::supports_avx512vl()) {
8029       src_vlen_enc = Assembler::AVX_512bit;
8030     }
8031     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8032     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8033   %}
8034   ins_pipe( pipe_slow );
8035 %}
8036 
8037 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size, rRegI tmp) %{
8038   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
8039   match(Set dst (VectorStoreMask mask size));
8040   effect(TEMP_DEF dst, TEMP tmp);
8041   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
8042   ins_encode %{
8043     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
8044     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
8045                  false, Assembler::AVX_512bit, $tmp$$Register);
8046     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
8047   %}
8048   ins_pipe( pipe_slow );
8049 %}
8050 
8051 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
8052   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
8053   match(Set dst (VectorStoreMask mask size));
8054   effect(TEMP_DEF dst);
8055   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
8056   ins_encode %{
8057     int dst_vlen_enc = vector_length_encoding(this);
8058     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
8059     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8060   %}
8061   ins_pipe( pipe_slow );
8062 %}
8063 
8064 instruct vmaskcast_evex(kReg dst) %{
8065   predicate(Matcher::vector_length(n) == Matcher::vector_length(n->in(1)));
8066   match(Set dst (VectorMaskCast dst));
8067   ins_cost(0);
8068   format %{ "vector_mask_cast $dst" %}
8069   ins_encode %{
8070     // empty
8071   %}
8072   ins_pipe(empty);
8073 %}
8074 
8075 instruct vmaskcast(vec dst) %{
8076   predicate((Matcher::vector_length(n) == Matcher::vector_length(n->in(1))) &&
8077             (Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))));
8078   match(Set dst (VectorMaskCast dst));
8079   ins_cost(0);
8080   format %{ "vector_mask_cast $dst" %}
8081   ins_encode %{
8082     // empty
8083   %}
8084   ins_pipe(empty);
8085 %}
8086 
8087 //-------------------------------- Load Iota Indices ----------------------------------
8088 
8089 instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
8090   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
8091   match(Set dst (VectorLoadConst src));
8092   effect(TEMP scratch);
8093   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
8094   ins_encode %{
8095      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8096      __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen_in_bytes);
8097   %}
8098   ins_pipe( pipe_slow );
8099 %}
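
// Illustration for loadIotaIndices above: the constant table holds the byte indices
// 0, 1, 2, ... so for a 16-byte vector the destination becomes {0, 1, 2, ..., 15},
// which the shuffle/rearrange rules below then consume as shuffle indices.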
8100 
8101 //-------------------------------- Rearrange ----------------------------------
8102 
8103 // LoadShuffle/Rearrange for Byte
8104 
8105 instruct loadShuffleB(vec dst) %{
8106   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
8107   match(Set dst (VectorLoadShuffle dst));
8108   format %{ "vector_load_shuffle $dst, $dst" %}
8109   ins_encode %{
8110     // empty
8111   %}
8112   ins_pipe( pipe_slow );
8113 %}
8114 
8115 instruct rearrangeB(vec dst, vec shuffle) %{
8116   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8117             Matcher::vector_length(n) < 32);
8118   match(Set dst (VectorRearrange dst shuffle));
8119   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8120   ins_encode %{
8121     assert(UseSSE >= 4, "required");
8122     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8123   %}
8124   ins_pipe( pipe_slow );
8125 %}
8126 
8127 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
8128   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8129             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
8130   match(Set dst (VectorRearrange src shuffle));
8131   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
8132   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
8133   ins_encode %{
8134     assert(UseAVX >= 2, "required");
8135     // Swap src into vtmp1
8136     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
8137     // Shuffle swapped src to get entries from other 128 bit lane
8138     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8139     // Shuffle original src to get entries from self 128 bit lane
8140     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8141     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
8142     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
8143     // Perform the blend
8144     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
8145   %}
8146   ins_pipe( pipe_slow );
8147 %}
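
// How rearrangeB_avx above works: vpshufb only shuffles within each 128-bit lane, so a
// 256-bit byte rearrange shuffles both the original source and a lane-swapped copy and
// then blends them. Assuming vector_byte_shufflemask() holds 0x70 in every byte: an index
// below 16 stays at or below 0x7F (MSB clear, vpblendvb keeps the in-lane result), while
// an index of 16 or more wraps past 0x80 (MSB set, vpblendvb takes the swapped-lane byte).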
8148 
8149 instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
8150   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8151             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
8152   match(Set dst (VectorRearrange src shuffle));
8153   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8154   ins_encode %{
8155     int vlen_enc = vector_length_encoding(this);
8156     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8157   %}
8158   ins_pipe( pipe_slow );
8159 %}
8160 
8161 // LoadShuffle/Rearrange for Short
8162 
8163 instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
8164   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8165             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
8166   match(Set dst (VectorLoadShuffle src));
8167   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8168   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8169   ins_encode %{
8170     // Create a byte shuffle mask from the short shuffle mask, since
8171     // only a byte shuffle instruction is available on these platforms.
8172     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8173     if (UseAVX == 0) {
8174       assert(vlen_in_bytes <= 16, "required");
8175       // Multiply each shuffle by two to get byte index
8176       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
8177       __ psllw($vtmp$$XMMRegister, 1);
8178 
8179       // Duplicate to create 2 copies of byte index
8180       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
8181       __ psllw($dst$$XMMRegister, 8);
8182       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
8183 
8184       // Add one to get alternate byte index
8185       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
8186       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
8187     } else {
8188       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
8189       int vlen_enc = vector_length_encoding(this);
8190       // Multiply each shuffle by two to get byte index
8191       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
8192       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
8193 
8194       // Duplicate to create 2 copies of byte index
8195       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
8196       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8197 
8198       // Add one to get alternate byte index
8199       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, $scratch$$Register);
8200     }
8201   %}
8202   ins_pipe( pipe_slow );
8203 %}
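
// Worked example for loadShuffleS above, assuming vector_short_shufflemask() holds
// 0x0100 in every 16-bit word: a short shuffle index of 3 becomes 2*3 = 6, is duplicated
// into both bytes (0x0606) and then adjusted to 0x0706, i.e. the byte indices {6, 7}.
// A pshufb with that control therefore gathers exactly the two bytes of short lane 3.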
8204 
8205 instruct rearrangeS(vec dst, vec shuffle) %{
8206   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8207             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
8208   match(Set dst (VectorRearrange dst shuffle));
8209   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8210   ins_encode %{
8211     assert(UseSSE >= 4, "required");
8212     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8213   %}
8214   ins_pipe( pipe_slow );
8215 %}
8216 
8217 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
8218   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8219             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
8220   match(Set dst (VectorRearrange src shuffle));
8221   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
8222   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
8223   ins_encode %{
8224     assert(UseAVX >= 2, "required");
8225     // Swap src into vtmp1
8226     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
8227     // Shuffle swapped src to get entries from other 128 bit lane
8228     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8229     // Shuffle original src to get entries from self 128 bit lane
8230     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8231     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
8232     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
8233     // Perform the blend
8234     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
8235   %}
8236   ins_pipe( pipe_slow );
8237 %}
8238 
8239 instruct loadShuffleS_evex(vec dst, vec src) %{
8240   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8241             VM_Version::supports_avx512bw());
8242   match(Set dst (VectorLoadShuffle src));
8243   format %{ "vector_load_shuffle $dst, $src" %}
8244   ins_encode %{
8245     int vlen_enc = vector_length_encoding(this);
8246     if (!VM_Version::supports_avx512vl()) {
8247       vlen_enc = Assembler::AVX_512bit;
8248     }
8249     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8250   %}
8251   ins_pipe( pipe_slow );
8252 %}
8253 
8254 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
8255   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8256             VM_Version::supports_avx512bw());
8257   match(Set dst (VectorRearrange src shuffle));
8258   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8259   ins_encode %{
8260     int vlen_enc = vector_length_encoding(this);
8261     if (!VM_Version::supports_avx512vl()) {
8262       vlen_enc = Assembler::AVX_512bit;
8263     }
8264     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8265   %}
8266   ins_pipe( pipe_slow );
8267 %}
8268 
8269 // LoadShuffle/Rearrange for Integer and Float
8270 
8271 instruct loadShuffleI(vec dst, vec src, vec vtmp, rRegP scratch) %{
8272   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8273             Matcher::vector_length(n) == 4 && UseAVX < 2);
8274   match(Set dst (VectorLoadShuffle src));
8275   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8276   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8277   ins_encode %{
8278     assert(UseSSE >= 4, "required");
8279 
8280     // Create a byte shuffle mask from the int shuffle mask, since
8281     // only a byte shuffle instruction is available on these platforms.
8282 
8283     // Duplicate and multiply each shuffle by 4
8284     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
8285     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
8286     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
8287     __ psllw($vtmp$$XMMRegister, 2);
8288 
8289     // Duplicate again to create 4 copies of byte index
8290     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
8291     __ psllw($dst$$XMMRegister, 8);
8292     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
8293 
8294     // Add 3,2,1,0 to get alternate byte index
8295     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
8296     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
8297   %}
8298   ins_pipe( pipe_slow );
8299 %}
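
// Worked example for loadShuffleI above, assuming vector_int_shufflemask() holds
// 0x03020100 in every dword: an int shuffle index of 2 becomes 4*2 = 8, is replicated
// into all four bytes (0x08080808) and adjusted to 0x0B0A0908, i.e. byte indices
// {8, 9, 10, 11}, which are exactly the four bytes of int lane 2 for the pshufb below.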
8300 
8301 instruct rearrangeI(vec dst, vec shuffle) %{
8302  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8303            Matcher::vector_length(n) == 4 && UseAVX < 2);
8304   match(Set dst (VectorRearrange dst shuffle));
8305   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8306   ins_encode %{
8307     assert(UseSSE >= 4, "required");
8308     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8309   %}
8310   ins_pipe( pipe_slow );
8311 %}
8312 
8313 instruct loadShuffleI_avx(vec dst, vec src) %{
8314   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8315             UseAVX >= 2);
8316   match(Set dst (VectorLoadShuffle src));
8317   format %{ "vector_load_shuffle $dst, $src" %}
8318   ins_encode %{
8319     int vlen_enc = vector_length_encoding(this);
8320     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8321   %}
8322   ins_pipe( pipe_slow );
8323 %}
8324 
8325 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
8326   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8327             UseAVX >= 2);
8328   match(Set dst (VectorRearrange src shuffle));
8329   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8330   ins_encode %{
8331     int vlen_enc = vector_length_encoding(this);
8332     if (vlen_enc == Assembler::AVX_128bit) {
8333       vlen_enc = Assembler::AVX_256bit;
8334     }
8335     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8336   %}
8337   ins_pipe( pipe_slow );
8338 %}
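
// Note on rearrangeI_avx above: vpermd has no 128-bit form (it permutes at least eight
// dwords), so a 4-lane rearrange is encoded at 256 bits and the unused upper ymm lanes
// are simply ignored.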
8339 
8340 // LoadShuffle/Rearrange for Long and Double
8341 
8342 instruct loadShuffleL(vec dst, vec src, vec vtmp, rRegP scratch) %{
8343   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8344             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
8345   match(Set dst (VectorLoadShuffle src));
8346   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8347   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8348   ins_encode %{
8349     assert(UseAVX >= 2, "required");
8350 
8351     int vlen_enc = vector_length_encoding(this);
8352     // Create a double-word shuffle mask from the long shuffle mask, since
8353     // only a double-word shuffle instruction is available on these platforms.
8354 
8355     // Multiply each shuffle by two to get double word index
8356     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
8357     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
8358 
8359     // Duplicate each double word shuffle
8360     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
8361     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8362 
8363     // Add one to get alternate double word index
8364     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, $scratch$$Register);
8365   %}
8366   ins_pipe( pipe_slow );
8367 %}
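
// Worked example for loadShuffleL above, assuming vector_long_shufflemask() holds the
// dword pair {0, 1} in every qword: a long shuffle index of 1 becomes 2*1 = 2, is
// duplicated into both dwords {2, 2} and adjusted to {2, 3}, so the vpermd in rearrangeL
// gathers the two dword halves of long lane 1.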
8368 
8369 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
8370   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8371             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
8372   match(Set dst (VectorRearrange src shuffle));
8373   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8374   ins_encode %{
8375     assert(UseAVX >= 2, "required");
8376 
8377     int vlen_enc = vector_length_encoding(this);
8378     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8379   %}
8380   ins_pipe( pipe_slow );
8381 %}
8382 
8383 instruct loadShuffleL_evex(vec dst, vec src) %{
8384   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8385             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
8386   match(Set dst (VectorLoadShuffle src));
8387   format %{ "vector_load_shuffle $dst, $src" %}
8388   ins_encode %{
8389     assert(UseAVX > 2, "required");
8390 
8391     int vlen_enc = vector_length_encoding(this);
8392     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8393   %}
8394   ins_pipe( pipe_slow );
8395 %}
8396 
8397 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
8398   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8399             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
8400   match(Set dst (VectorRearrange src shuffle));
8401   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8402   ins_encode %{
8403     assert(UseAVX > 2, "required");
8404 
8405     int vlen_enc = vector_length_encoding(this);
8406     if (vlen_enc == Assembler::AVX_128bit) {
8407       vlen_enc = Assembler::AVX_256bit;
8408     }
8409     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8410   %}
8411   ins_pipe( pipe_slow );
8412 %}
8413 
8414 // --------------------------------- FMA --------------------------------------
8415 // a * b + c
8416 
8417 instruct vfmaF_reg(vec a, vec b, vec c) %{
8418   match(Set c (FmaVF  c (Binary a b)));
8419   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8420   ins_cost(150);
8421   ins_encode %{
8422     assert(UseFMA, "not enabled");
8423     int vlen_enc = vector_length_encoding(this);
8424     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8425   %}
8426   ins_pipe( pipe_slow );
8427 %}
8428 
8429 instruct vfmaF_mem(vec a, memory b, vec c) %{
8430   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8431   match(Set c (FmaVF  c (Binary a (LoadVector b))));
8432   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8433   ins_cost(150);
8434   ins_encode %{
8435     assert(UseFMA, "not enabled");
8436     int vlen_enc = vector_length_encoding(this);
8437     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8438   %}
8439   ins_pipe( pipe_slow );
8440 %}
8441 
8442 instruct vfmaD_reg(vec a, vec b, vec c) %{
8443   match(Set c (FmaVD  c (Binary a b)));
8444   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8445   ins_cost(150);
8446   ins_encode %{
8447     assert(UseFMA, "not enabled");
8448     int vlen_enc = vector_length_encoding(this);
8449     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8450   %}
8451   ins_pipe( pipe_slow );
8452 %}
8453 
8454 instruct vfmaD_mem(vec a, memory b, vec c) %{
8455   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8456   match(Set c (FmaVD  c (Binary a (LoadVector b))));
8457   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8458   ins_cost(150);
8459   ins_encode %{
8460     assert(UseFMA, "not enabled");
8461     int vlen_enc = vector_length_encoding(this);
8462     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8463   %}
8464   ins_pipe( pipe_slow );
8465 %}
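
// Note on the FMA rules above: vfmaf/vfmad compute dst[i] = a[i] * b[i] + c[i] with a
// single (fused) rounding per lane, which can differ in the last bit from a separate
// multiply followed by an add; hence the UseFMA guard in the encodings.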
8466 
8467 // --------------------------------- Vector Multiply Add --------------------------------------
8468 
8469 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
8470   predicate(UseAVX == 0);
8471   match(Set dst (MulAddVS2VI dst src1));
8472   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
8473   ins_encode %{
8474     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
8475   %}
8476   ins_pipe( pipe_slow );
8477 %}
8478 
8479 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
8480   predicate(UseAVX > 0);
8481   match(Set dst (MulAddVS2VI src1 src2));
8482   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
8483   ins_encode %{
8484     int vlen_enc = vector_length_encoding(this);
8485     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8486   %}
8487   ins_pipe( pipe_slow );
8488 %}
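
// Illustration for the MulAddVS2VI rules above: (v)pmaddwd multiplies adjacent signed
// 16-bit pairs and sums each pair into a 32-bit lane, i.e.
//   dst[i] = s1[2i]*s2[2i] + s1[2i+1]*s2[2i+1]
// For example {1, 2, 3, 4} * {5, 6, 7, 8} yields {1*5 + 2*6, 3*7 + 4*8} = {17, 53}.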
8489 
8490 // --------------------------------- Vector Multiply Add Add ----------------------------------
8491 
8492 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
8493   predicate(VM_Version::supports_avx512_vnni());
8494   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
8495   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
8496   ins_encode %{
8497     assert(UseAVX > 2, "required");
8498     int vlen_enc = vector_length_encoding(this);
8499     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8500   %}
8501   ins_pipe( pipe_slow );
8502   ins_cost(10);
8503 %}
8504 
8505 // --------------------------------- PopCount --------------------------------------
8506 
8507 instruct vpopcountI(vec dst, vec src) %{
8508   match(Set dst (PopCountVI src));
8509   format %{ "vpopcntd  $dst,$src\t! vector popcount packedI" %}
8510   ins_encode %{
8511     assert(UsePopCountInstruction, "not enabled");
8512 
8513     int vlen_enc = vector_length_encoding(this);
8514     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8515   %}
8516   ins_pipe( pipe_slow );
8517 %}
8518 
8519 // --------------------------------- Bitwise Ternary Logic ----------------------------------
8520 
8521 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
8522   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
8523   effect(TEMP dst);
8524   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8525   ins_encode %{
8526     int vector_len = vector_length_encoding(this);
8527     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
8528   %}
8529   ins_pipe( pipe_slow );
8530 %}
8531 
8532 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
8533   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
8534   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
8535   effect(TEMP dst);
8536   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8537   ins_encode %{
8538     int vector_len = vector_length_encoding(this);
8539     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
8540   %}
8541   ins_pipe( pipe_slow );
8542 %}
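
// Note on the MacroLogicV rules above: the $func immediate is a 3-input truth table,
// indexed per bit position by the triple of bits taken from $dst, $src2 and $src3.
// For example 0x96 encodes the three-way XOR dst ^ src2 ^ src3, and 0xE8 encodes the
// bitwise majority function.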
8543 
8544 // --------------------------------- Rotation Operations ----------------------------------
8545 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
8546   match(Set dst (RotateLeftV src shift));
8547   match(Set dst (RotateRightV src shift));
8548   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
8549   ins_encode %{
8550     int opcode      = this->ideal_Opcode();
8551     int vector_len  = vector_length_encoding(this);
8552     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8553     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
8554   %}
8555   ins_pipe( pipe_slow );
8556 %}
8557 
8558 instruct vprorate(vec dst, vec src, vec shift) %{
8559   match(Set dst (RotateLeftV src shift));
8560   match(Set dst (RotateRightV src shift));
8561   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
8562   ins_encode %{
8563     int opcode      = this->ideal_Opcode();
8564     int vector_len  = vector_length_encoding(this);
8565     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8566     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8567   %}
8568   ins_pipe( pipe_slow );
8569 %}
8570 
8571 #ifdef _LP64
8572 // ---------------------------------- Masked Operations ------------------------------------
8573 
8574 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
8575   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
8576   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
8577   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
8578   ins_encode %{
8579     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
8580     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
8581 
8582     Label DONE;
8583     int vlen_enc = vector_length_encoding(this, $src1);
8584     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
8585 
8586     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
8587     __ mov64($dst$$Register, -1L);
8588     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
8589     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
8590     __ jccb(Assembler::carrySet, DONE);
8591     __ kmovql($dst$$Register, $ktmp1$$KRegister);
8592     __ notq($dst$$Register);
8593     __ tzcntq($dst$$Register, $dst$$Register);
8594     __ bind(DONE);
8595   %}
8596   ins_pipe( pipe_slow );
8597 %}
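
// Sketch of the contract implemented by vmask_cmp_node above, assuming $mask is a
// prefix mask as produced by VectorMaskGen: if every masked lane of $src1 and $src2
// compares equal, $dst is left at -1; otherwise the equality mask is inverted and tzcnt
// yields the index of the first mismatching lane (a first difference in lane 5 gives 5).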
8598 
8599 
8600 instruct vmasked_load64(vec dst, memory mem, kReg mask) %{
8601   match(Set dst (LoadVectorMasked mem mask));
8602   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
8603   ins_encode %{
8604     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
8605     int vector_len = vector_length_encoding(this);
8606     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
8607   %}
8608   ins_pipe( pipe_slow );
8609 %}
8610 
8611 instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
8612   match(Set dst (VectorMaskGen len));
8613   effect(TEMP temp);
8614   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
8615   ins_encode %{
8616     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
8617   %}
8618   ins_pipe( pipe_slow );
8619 %}
8620 
8621 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
8622   match(Set dst (VectorMaskGen len));
8623   format %{ "vector_mask_gen $len \t! vector mask generator" %}
8624   effect(TEMP temp);
8625   ins_encode %{
8626     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
8627     __ kmovql($dst$$KRegister, $temp$$Register);
8628   %}
8629   ins_pipe( pipe_slow );
8630 %}
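
// Worked example for vmask_gen_imm above: for $len == 5 the shift produces
// 0xFFFFFFFFFFFFFFFF >> 59 == 0x1F, so the k-register ends up with its low five bits
// set, i.e. exactly the first five lanes active.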
8631 
8632 instruct vmasked_store64(memory mem, vec src, kReg mask) %{
8633   match(Set mem (StoreVectorMasked mem (Binary src mask)));
8634   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
8635   ins_encode %{
8636     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
8637     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
8638     int vector_len = vector_length_encoding(src_node);
8639     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, vector_len);
8640   %}
8641   ins_pipe( pipe_slow );
8642 %}
8643 
8644 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
8645   predicate(n->in(1)->bottom_type()->isa_vectmask());
8646   match(Set dst (VectorMaskToLong mask));
8647   effect(TEMP dst, KILL cr);
8648   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
8649   ins_encode %{
8650     int mask_len = Matcher::vector_length(this, $mask);
8651     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8652     if (VM_Version::supports_avx512vlbw()) {
8653       __ kmovql($dst$$Register, $mask$$KRegister);
8654     } else {
8655       assert(mask_len <= 16, "");
8656       __ kmovwl($dst$$Register, $mask$$KRegister);
8657     }
8658     // A mask generated by partial vector comparison/replicate/mask-manipulation
8659     // operations needs to be clipped.
8660     int mask_size = mask_len * type2aelembytes(mbt);
8661     if (mask_size < 16) {
8662       __ andq($dst$$Register, (((jlong)1 << mask_len) - 1));
8663     }
8664   %}
8665   ins_pipe( pipe_slow );
8666 %}
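
// Clipping example for vmask_tolong_evex above: for a mask of four byte lanes the kmovwl
// transfer copies 16 bits, so the result is ANDed with (1 << 4) - 1 == 0xF to discard the
// stale upper bits.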
8667 
8668 instruct vmask_tolong_avx(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
8669   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL &&
8670             n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
8671   match(Set dst (VectorMaskToLong mask));
8672   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
8673   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
8674   ins_encode %{
8675     int mask_len = Matcher::vector_length(this, $mask);
8676     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8677     int vlen_enc = vector_length_encoding(this, $mask);
8678     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
8679     __ vpsubb($xtmp$$XMMRegister, $xtmp$$XMMRegister, $mask$$XMMRegister, vlen_enc);
8680     __ vpmovmskb($dst$$Register, $xtmp$$XMMRegister, vlen_enc);
8681     // A mask generated by partial vector comparison/replicate/mask-manipulation
8682     // operations needs to be clipped.
8683     int mask_size = mask_len * type2aelembytes(mbt);
8684     if (mask_size < 16) {
8685       __ andq($dst$$Register, (((jlong)1 << mask_len) - 1));
8686     }
8687   %}
8688   ins_pipe( pipe_slow );
8689 %}
8690 
8691 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
8692   predicate(n->in(1)->bottom_type()->isa_vectmask());
8693   match(Set dst (VectorMaskTrueCount mask));
8694   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
8695   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
8696   ins_encode %{
8697     int opcode = this->ideal_Opcode();
8698     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8699     int mask_len = Matcher::vector_length(this, $mask);
8700     int mask_size = mask_len * type2aelembytes(mbt);
8701     int vlen_enc = vector_length_encoding(this, $mask);
8702     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister, $tmp$$Register,
8703                              mask_len, mask_size, vlen_enc);
8704   %}
8705   ins_pipe( pipe_slow );
8706 %}
8707 
8708 instruct vmask_truecount_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1, rFlagsReg cr) %{
8709   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8710   match(Set dst (VectorMaskTrueCount mask));
8711   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, TEMP xtmp1, KILL cr);
8712   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp and $xtmp1 as TEMP" %}
8713   ins_encode %{
8714     int opcode = this->ideal_Opcode();
8715     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8716     int mask_len = Matcher::vector_length(this, $mask);
8717     int mask_size = mask_len * type2aelembytes(mbt);
8718     int vlen_enc = vector_length_encoding(this, $mask);
8719     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8720                              $xtmp1$$XMMRegister, $tmp$$Register, mask_len, mask_size, vlen_enc);
8721   %}
8722   ins_pipe( pipe_slow );
8723 %}
8724 
8725 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
8726   predicate(n->in(1)->bottom_type()->isa_vectmask());
8727   match(Set dst (VectorMaskFirstTrue mask));
8728   match(Set dst (VectorMaskLastTrue mask));
8729   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
8730   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
8731   ins_encode %{
8732     int opcode = this->ideal_Opcode();
8733     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8734     int mask_len = Matcher::vector_length(this, $mask);
8735     int mask_size = mask_len * type2aelembytes(mbt);
8736     int vlen_enc = vector_length_encoding(this, $mask);
8737     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister, $tmp$$Register, mask_len,
8738                              mask_size, vlen_enc);
8739   %}
8740   ins_pipe( pipe_slow );
8741 %}
8742 
8743 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1, rFlagsReg cr) %{
8744   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8745   match(Set dst (VectorMaskFirstTrue mask));
8746   match(Set dst (VectorMaskLastTrue mask));
8747   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, TEMP xtmp1, KILL cr);
8748   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp and $xtmp1 as TEMP" %}
8749   ins_encode %{
8750     int opcode = this->ideal_Opcode();
8751     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8752     int mask_len = Matcher::vector_length(this, $mask);
8753     int mask_size = mask_len * type2aelembytes(mbt);
8754     int vlen_enc = vector_length_encoding(this, $mask);
8755     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8756                              $xtmp1$$XMMRegister, $tmp$$Register, mask_len, mask_size, vlen_enc);
8757   %}
8758   ins_pipe( pipe_slow );
8759 %}
8760 #endif // _LP64
8761 
8762 // ---------------------------------- Vector Masked Operations ------------------------------------
8763 
8764 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
8765   match(Set dst (AddVB (Binary dst src2) mask));
8766   match(Set dst (AddVS (Binary dst src2) mask));
8767   match(Set dst (AddVI (Binary dst src2) mask));
8768   match(Set dst (AddVL (Binary dst src2) mask));
8769   match(Set dst (AddVF (Binary dst src2) mask));
8770   match(Set dst (AddVD (Binary dst src2) mask));
8771   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
8772   ins_encode %{
8773     int vlen_enc = vector_length_encoding(this);
8774     BasicType bt = Matcher::vector_element_basic_type(this);
8775     int opc = this->ideal_Opcode();
8776     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8777                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8778   %}
8779   ins_pipe( pipe_slow );
8780 %}
8781 
8782 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
8783   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
8784   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
8785   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
8786   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
8787   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
8788   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
8789   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
8790   ins_encode %{
8791     int vlen_enc = vector_length_encoding(this);
8792     BasicType bt = Matcher::vector_element_basic_type(this);
8793     int opc = this->ideal_Opcode();
8794     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8795                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8796   %}
8797   ins_pipe( pipe_slow );
8798 %}
8799 
8800 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
8801   match(Set dst (XorV (Binary dst src2) mask));
8802   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
8803   ins_encode %{
8804     int vlen_enc = vector_length_encoding(this);
8805     BasicType bt = Matcher::vector_element_basic_type(this);
8806     int opc = this->ideal_Opcode();
8807     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8808                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8809   %}
8810   ins_pipe( pipe_slow );
8811 %}
8812 
8813 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
8814   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
8815   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
8816   ins_encode %{
8817     int vlen_enc = vector_length_encoding(this);
8818     BasicType bt = Matcher::vector_element_basic_type(this);
8819     int opc = this->ideal_Opcode();
8820     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8821                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8822   %}
8823   ins_pipe( pipe_slow );
8824 %}
8825 
8826 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
8827   match(Set dst (OrV (Binary dst src2) mask));
8828   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
8829   ins_encode %{
8830     int vlen_enc = vector_length_encoding(this);
8831     BasicType bt = Matcher::vector_element_basic_type(this);
8832     int opc = this->ideal_Opcode();
8833     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8834                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8835   %}
8836   ins_pipe( pipe_slow );
8837 %}
8838 
8839 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
8840   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
8841   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
8842   ins_encode %{
8843     int vlen_enc = vector_length_encoding(this);
8844     BasicType bt = Matcher::vector_element_basic_type(this);
8845     int opc = this->ideal_Opcode();
8846     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8847                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8848   %}
8849   ins_pipe( pipe_slow );
8850 %}
8851 
8852 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
8853   match(Set dst (AndV (Binary dst src2) mask));
8854   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
8855   ins_encode %{
8856     int vlen_enc = vector_length_encoding(this);
8857     BasicType bt = Matcher::vector_element_basic_type(this);
8858     int opc = this->ideal_Opcode();
8859     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8860                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8861   %}
8862   ins_pipe( pipe_slow );
8863 %}
8864 
8865 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
8866   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
8867   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
8868   ins_encode %{
8869     int vlen_enc = vector_length_encoding(this);
8870     BasicType bt = Matcher::vector_element_basic_type(this);
8871     int opc = this->ideal_Opcode();
8872     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8873                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8874   %}
8875   ins_pipe( pipe_slow );
8876 %}
8877 
instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (SubVB (Binary dst src2) mask));
  match(Set dst (SubVS (Binary dst src2) mask));
  match(Set dst (SubVI (Binary dst src2) mask));
  match(Set dst (SubVL (Binary dst src2) mask));
  match(Set dst (SubVF (Binary dst src2) mask));
  match(Set dst (SubVD (Binary dst src2) mask));
  format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
  match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
  format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MulVS (Binary dst src2) mask));
  match(Set dst (MulVI (Binary dst src2) mask));
  match(Set dst (MulVL (Binary dst src2) mask));
  match(Set dst (MulVF (Binary dst src2) mask));
  match(Set dst (MulVD (Binary dst src2) mask));
  format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
  match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
  match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
  match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
  match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
  format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vsqrt_reg_masked(vec dst, kReg mask) %{
  match(Set dst (SqrtVF dst mask));
  match(Set dst (SqrtVD dst mask));
  ins_cost(100);
  format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (DivVF (Binary dst src2) mask));
  match(Set dst (DivVD (Binary dst src2) mask));
  format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
  match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
  format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

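// Masked rotate and shift operations. For the register forms, is_varshift
// distinguishes per-lane variable shift counts from a single broadcast count
// (see VectorNode::is_vshift_cnt_opcode on the shift input).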
instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
  match(Set dst (RotateLeftV (Binary dst shift) mask));
  match(Set dst (RotateRightV (Binary dst shift) mask));
  format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (RotateLeftV (Binary dst src2) mask));
  match(Set dst (RotateRightV (Binary dst src2) mask));
  format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
  match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
  match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
  match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
  format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (LShiftVS (Binary dst src2) mask));
  match(Set dst (LShiftVI (Binary dst src2) mask));
  match(Set dst (LShiftVL (Binary dst src2) mask));
  format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    bool is_varshift = !VectorNode::is_vshift_cnt_opcode(in(2)->isa_Mach()->ideal_Opcode());
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, is_varshift);
  %}
  ins_pipe( pipe_slow );
%}

instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
  match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
  match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
  format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
  match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
  format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (RShiftVS (Binary dst src2) mask));
  match(Set dst (RShiftVI (Binary dst src2) mask));
  match(Set dst (RShiftVL (Binary dst src2) mask));
  format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    bool is_varshift = !VectorNode::is_vshift_cnt_opcode(in(2)->isa_Mach()->ideal_Opcode());
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, is_varshift);
  %}
  ins_pipe( pipe_slow );
%}

instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
  match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
  match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
  format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
  match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
  match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
  format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (URShiftVS (Binary dst src2) mask));
  match(Set dst (URShiftVI (Binary dst src2) mask));
  match(Set dst (URShiftVL (Binary dst src2) mask));
  format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    bool is_varshift = !VectorNode::is_vshift_cnt_opcode(in(2)->isa_Mach()->ideal_Opcode());
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, is_varshift);
  %}
  ins_pipe( pipe_slow );
%}

instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
  match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
  match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
  format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MaxV (Binary dst src2) mask));
  format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
  format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (MinV (Binary dst src2) mask));
  format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
  match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
  format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
  match(Set dst (VectorRearrange (Binary dst src2) mask));
  format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vabs_masked(vec dst, kReg mask) %{
  match(Set dst (AbsVB dst mask));
  match(Set dst (AbsVS dst mask));
  match(Set dst (AbsVI dst mask));
  match(Set dst (AbsVL dst mask));
  format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
  ins_cost(100);
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
  match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
  match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
  format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
  ins_encode %{
    int vlen_enc = vector_length_encoding(this);
    BasicType bt = Matcher::vector_element_basic_type(this);
    int opc = this->ideal_Opcode();
    __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
                   $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

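// Masked vector comparison, producing an opmask result under the control mask.
// Integral element types select a signed or unsigned evpcmp{b,w,d,q} form from
// the boolean predicate; floating point types use evcmpps/evcmppd.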
instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask, rRegP scratch) %{
  match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
  effect(TEMP scratch);
  format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask\t! using $scratch as TEMP" %}
  ins_encode %{
    assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
    int vlen_enc = vector_length_encoding(this, $src1);
    BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);

    // Per-element comparison, dispatched on the element basic type of src1.
    switch (src1_elem_bt) {
      case T_BYTE: {
        bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_SHORT: {
        bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_INT: {
        bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_LONG: {
        bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
        Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
        __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
        break;
      }
      case T_FLOAT: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      case T_DOUBLE: {
        Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
        __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
        break;
      }
      default: assert(false, "%s", type2name(src1_elem_bt)); break;
    }
  %}
  ins_pipe( pipe_slow );
%}

#ifdef _LP64
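// MaskAll: build an opmask whose low vector-length bits reflect the scalar
// source (all ones for -1, all zeros for 0). The 64-bit opmask forms are used
// when AVX512BW is available; otherwise the 16-bit forms limit the vector
// length to 16.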
instruct mask_all_evexI_imm(kReg dst, immI cnt, rRegL tmp) %{
  match(Set dst (MaskAll cnt));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "mask_all_evexI $dst, $cnt \t! using $tmp as TEMP" %}
  ins_encode %{
    int vec_len = Matcher::vector_length(this);
    if (VM_Version::supports_avx512bw()) {
      __ movq($tmp$$Register, $cnt$$constant);
      __ kmovql($dst$$KRegister, $tmp$$Register);
      __ kshiftrql($dst$$KRegister, $dst$$KRegister, 64 - vec_len);
    } else {
      assert(vec_len <= 16, "");
      __ movq($tmp$$Register, $cnt$$constant);
      __ kmovwl($dst$$KRegister, $tmp$$Register);
      __ kshiftrwl($dst$$KRegister, $dst$$KRegister, 16 - vec_len);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct mask_all_evexI(kReg dst, rRegI src, rRegL tmp) %{
  match(Set dst (MaskAll src));
  effect(TEMP_DEF dst, TEMP tmp);
  format %{ "mask_all_evexI $dst, $src \t! using $tmp as TEMP" %}
  ins_encode %{
    int vec_len = Matcher::vector_length(this);
    if (VM_Version::supports_avx512bw()) {
      __ movslq($tmp$$Register, $src$$Register);
      __ kmovql($dst$$KRegister, $tmp$$Register);
      __ kshiftrql($dst$$KRegister, $dst$$KRegister, 64 - vec_len);
    } else {
      assert(vec_len <= 16, "");
      __ kmovwl($dst$$KRegister, $src$$Register);
      __ kshiftrwl($dst$$KRegister, $dst$$KRegister, 16 - vec_len);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct mask_all_evexL(kReg dst, rRegL src) %{
  match(Set dst (MaskAll src));
  effect(TEMP_DEF dst);
  format %{ "mask_all_evexL $dst, $src \t! mask all operation" %}
  ins_encode %{
    int vec_len = Matcher::vector_length(this);
    if (VM_Version::supports_avx512bw()) {
      __ kmovql($dst$$KRegister, $src$$Register);
      __ kshiftrql($dst$$KRegister, $dst$$KRegister, 64 - vec_len);
    } else {
      assert(vec_len <= 16, "");
      __ kmovwl($dst$$KRegister, $src$$Register);
      __ kshiftrwl($dst$$KRegister, $dst$$KRegister, 16 - vec_len);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
  predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
  match(Set dst (XorVMask src (MaskAll cnt)));
  effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
  format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
  predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
            (Matcher::vector_length(n) == 16) ||
            (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
  match(Set dst (XorVMask src (MaskAll cnt)));
  format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
  ins_encode %{
    uint masklen = Matcher::vector_length(this);
    __ knot(masklen, $dst$$KRegister, $src$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}
#endif

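// Logical AND/OR/XOR of opmask registers. Mask lengths below 16 are widened
// to 16 when AVX512DQ is unavailable; both inputs must have the same mask type.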
instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
  match(Set dst (AndVMask src1 src2));
  match(Set dst (OrVMask src1 src2));
  match(Set dst (XorVMask src1 src2));
  effect(TEMP kscratch);
  format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
  ins_encode %{
    const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
    const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
    assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
    uint masklen = Matcher::vector_length(this);
    masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
    __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
  %}
  ins_pipe( pipe_slow );
%}

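// CastVV nodes exist only for the benefit of the type system; they emit no code.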
instruct castMM(kReg dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVV(vec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castVVLeg(legVec dst)
%{
  match(Set dst (CastVV dst));

  size(0);
  format %{ "# castVV of $dst" %}
  ins_encode(/* empty encoding */);
  ins_cost(0);
  ins_pipe(empty);
%}
--- EOF ---