//
// Copyright (c) 2011, 2021, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
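//
// For example, the first definition below,
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares the low word of XMM0 as save-on-call under both conventions,
// spilled as a float (Op_RegF), with encoding 0.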

// XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM31 preserved across function calls
//              XMM0-XMM3 might hold parameters
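//
// Note that all of the XMM slots defined below are declared SOC
// (save-on-call) in both save-type columns.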

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
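// Note: k0 is not listed below; an EVEX opmask selector of zero means
// "no masking", so k0 is not usable as an allocatable predicate register.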
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

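// The reg_class_dynamic entries below pick the EVEX variant when the trailing
// %{ ... %} predicate holds (e.g. VM_Version::supports_evex()), and fall back
// to the legacy variant otherwise.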
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre evex 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for evex 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

 965 // Class for all 128bit vector registers
 966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 973                       XMM7,  XMM7b,  XMM7c,  XMM7d
 974 #ifdef _LP64
 975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 977                       XMM10, XMM10b, XMM10c, XMM10d,
 978                       XMM11, XMM11b, XMM11c, XMM11d,
 979                       XMM12, XMM12b, XMM12c, XMM12d,
 980                       XMM13, XMM13b, XMM13c, XMM13d,
 981                       XMM14, XMM14b, XMM14c, XMM14d,
 982                       XMM15, XMM15b, XMM15c, XMM15d
 983 #endif
 984                       );
 985 
 986 // Class for all 128bit vector registers
 987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 994                       XMM7,  XMM7b,  XMM7c,  XMM7d
 995 #ifdef _LP64
 996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 998                       XMM10, XMM10b, XMM10c, XMM10d,
 999                       XMM11, XMM11b, XMM11c, XMM11d,
1000                       XMM12, XMM12b, XMM12c, XMM12d,
1001                       XMM13, XMM13b, XMM13c, XMM13d,
1002                       XMM14, XMM14b, XMM14c, XMM14d,
1003                       XMM15, XMM15b, XMM15c, XMM15d,
1004                       XMM16, XMM16b, XMM16c, XMM16d,
1005                       XMM17, XMM17b, XMM17c, XMM17d,
1006                       XMM18, XMM18b, XMM18c, XMM18d,
1007                       XMM19, XMM19b, XMM19c, XMM19d,
1008                       XMM20, XMM20b, XMM20c, XMM20d,
1009                       XMM21, XMM21b, XMM21c, XMM21d,
1010                       XMM22, XMM22b, XMM22c, XMM22d,
1011                       XMM23, XMM23b, XMM23c, XMM23d,
1012                       XMM24, XMM24b, XMM24c, XMM24d,
1013                       XMM25, XMM25b, XMM25c, XMM25d,
1014                       XMM26, XMM26b, XMM26c, XMM26d,
1015                       XMM27, XMM27b, XMM27c, XMM27d,
1016                       XMM28, XMM28b, XMM28c, XMM28d,
1017                       XMM29, XMM29b, XMM29c, XMM29d,
1018                       XMM30, XMM30b, XMM30c, XMM30d,
1019                       XMM31, XMM31b, XMM31c, XMM31d
1020 #endif
1021                       );
1022 
1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1025 
1026 // Class for all 256bit vector registers
1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1035 #ifdef _LP64
1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
1044 #endif
1045                       );
1046 
1047 // Class for all 256bit vector registers
1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1056 #ifdef _LP64
1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1081 #endif
1082                       );
1083 
1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1086 
1087 // Class for all 512bit vector registers
1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1096 #ifdef _LP64
1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1121 #endif
1122                       );
1123 
1124 // Class for restricted 512bit vector registers
1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1133 #ifdef _LP64
1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1142 #endif
1143                       );
1144 
1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1147 
1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1149 %}
1150 
1151 
1152 //----------SOURCE BLOCK-------------------------------------------------------
1153 // This is a block of C++ code which provides values, functions, and
1154 // definitions necessary in the rest of the architecture description
1155 
1156 source_hpp %{
1157 // Header information of the source block.
1158 // Method declarations/definitions which are used outside
1159 // the ad-scope can conveniently be defined here.
1160 //
1161 // To keep related declarations/definitions/uses close together,
1162 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1163 
1164 #include "runtime/vm_version.hpp"
1165 
1166 class NativeJump;
1167 
1168 class CallStubImpl {
1169 
1170   //--------------------------------------------------------------
1171   //---<  Used for optimization in Compile::shorten_branches  >---
1172   //--------------------------------------------------------------
1173 
1174  public:
1175   // Size of call trampoline stub.
1176   static uint size_call_trampoline() {
1177     return 0; // no call trampolines on this platform
1178   }
1179 
1180   // number of relocations needed by a call trampoline stub
1181   static uint reloc_call_trampoline() {
1182     return 0; // no call trampolines on this platform
1183   }
1184 };
1185 
1186 class HandlerImpl {
1187 
1188  public:
1189 
1190   static int emit_exception_handler(CodeBuffer &cbuf);
1191   static int emit_deopt_handler(CodeBuffer& cbuf);
1192 
1193   static uint size_exception_handler() {
1194     // NativeCall instruction size is the same as NativeJump.
1195     // exception handler starts out as jump and can be patched to
1196     // a call by deoptimization.  (4932387)
1197     // Note that this value is also credited (in output.cpp) to
1198     // the size of the code section.
1199     return NativeJump::instruction_size;
1200   }
1201 
1202 #ifdef _LP64
1203   static uint size_deopt_handler() {
1204     // three 5-byte instructions plus one move for an unreachable address.
1205     return 15+3;
1206   }
1207 #else
1208   static uint size_deopt_handler() {
1209     // NativeCall instruction size is the same as NativeJump.
1210     // exception handler starts out as jump and can be patched to
1211     // a call by deoptimization.  (4932387)
1212     // Note that this value is also credited (in output.cpp) to
1213     // the size of the code section.
1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1215   }
1216 #endif
1217 };
1218 
1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
1220   switch(bytes) {
1221     case  4: // fall-through
1222     case  8: // fall-through
1223     case 16: return Assembler::AVX_128bit;
1224     case 32: return Assembler::AVX_256bit;
1225     case 64: return Assembler::AVX_512bit;
1226 
1227     default: {
1228       ShouldNotReachHere();
1229       return Assembler::AVX_NoVec;
1230     }
1231   }
1232 }
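     // For example, a 32-byte (256-bit) vector maps to Assembler::AVX_256bit, while 4- and
     // 8-byte vectors share the 128-bit encoding with 16-byte vectors.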
1233 
1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
1236 }
1237 
1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1239   uint def_idx = use->operand_index(opnd);
1240   Node* def = use->in(def_idx);
1241   return vector_length_encoding(def);
1242 }
1243 
1244 static inline bool is_unsigned_booltest_pred(int bt) {
1245   return (bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare;
1246 }
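     // The unsigned BoolTest variants (e.g. BoolTest::ult, BoolTest::uge) carry the
     // unsigned_compare bit and therefore report true here; their signed counterparts do not.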
1247 
1248 class Node::PD {
1249 public:
1250   enum NodeFlags {
1251     Flag_intel_jcc_erratum = Node::_last_flag << 1,
1252     _last_flag             = Flag_intel_jcc_erratum
1253   };
1254 };
1255 
1256 %} // end source_hpp
1257 
1258 source %{
1259 
1260 #include "opto/addnode.hpp"
1261 #include "c2_intelJccErratum_x86.hpp"
1262 
1263 void PhaseOutput::pd_perform_mach_node_analysis() {
1264   if (VM_Version::has_intel_jcc_erratum()) {
1265     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
1266     _buf_sizes._code += extra_padding;
1267   }
1268 }
1269 
1270 int MachNode::pd_alignment_required() const {
1271   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
1272     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
1273     return IntelJccErratum::largest_jcc_size() + 1;
1274   } else {
1275     return 1;
1276   }
1277 }
1278 
1279 int MachNode::compute_padding(int current_offset) const {
1280   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
1281     Compile* C = Compile::current();
1282     PhaseOutput* output = C->output();
1283     Block* block = output->block();
1284     int index = output->index();
1285     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
1286   } else {
1287     return 0;
1288   }
1289 }
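     // Together, the three hooks above implement the mitigation for the Intel JCC erratum:
     // affected branch nodes are tagged during mach node analysis, worst-case padding is
     // reserved in the code buffer, and compute_padding() then inserts just enough bytes so
     // that the (possibly macro-fused) jcc does not cross or end on a 32-byte boundary.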
1290 
1291 // Emit exception handler code.
1292 // Stuff framesize into a register and call a VM stub routine.
1293 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1294 
1295   // Note that the code buffer's insts_mark is always relative to insts.
1296   // That's why we must use the macroassembler to generate a handler.
1297   C2_MacroAssembler _masm(&cbuf);
1298   address base = __ start_a_stub(size_exception_handler());
1299   if (base == NULL) {
1300     ciEnv::current()->record_failure("CodeCache is full");
1301     return 0;  // CodeBuffer::expand failed
1302   }
1303   int offset = __ offset();
1304   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1305   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1306   __ end_a_stub();
1307   return offset;
1308 }
1309 
1310 // Emit deopt handler code.
1311 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1312 
1313   // Note that the code buffer's insts_mark is always relative to insts.
1314   // That's why we must use the macroassembler to generate a handler.
1315   C2_MacroAssembler _masm(&cbuf);
1316   address base = __ start_a_stub(size_deopt_handler());
1317   if (base == NULL) {
1318     ciEnv::current()->record_failure("CodeCache is full");
1319     return 0;  // CodeBuffer::expand failed
1320   }
1321   int offset = __ offset();
1322 
1323 #ifdef _LP64
1324   address the_pc = (address) __ pc();
1325   Label next;
1326   // Push "the_pc" on the stack without destroying any registers,
1327   // as they may all be live.
1328 
1329   // push address of "next"
1330   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1331   __ bind(next);
1332   // adjust it so it matches "the_pc"
1333   __ subptr(Address(rsp, 0), __ offset() - offset);
1334 #else
1335   InternalAddress here(__ pc());
1336   __ pushptr(here.addr());
1337 #endif
1338 
1339   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1340   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1341   __ end_a_stub();
1342   return offset;
1343 }
1344 
1345 Assembler::Width widthForType(BasicType bt) {
1346   if (bt == T_BYTE) {
1347     return Assembler::B;
1348   } else if (bt == T_SHORT) {
1349     return Assembler::W;
1350   } else if (bt == T_INT) {
1351     return Assembler::D;
1352   } else {
1353     assert(bt == T_LONG, "not a long: %s", type2name(bt));
1354     return Assembler::Q;
1355   }
1356 }
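     // Used when an instruction's encoding depends on the element width, e.g. a T_SHORT
     // vector operation is emitted with Assembler::W and a T_LONG one with Assembler::Q.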
1357 
1358 //=============================================================================
1359 
1360   // Float masks come from different places depending on platform.
1361 #ifdef _LP64
1362   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1363   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1364   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1365   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1366 #else
1367   static address float_signmask()  { return (address)float_signmask_pool; }
1368   static address float_signflip()  { return (address)float_signflip_pool; }
1369   static address double_signmask() { return (address)double_signmask_pool; }
1370   static address double_signflip() { return (address)double_signflip_pool; }
1371 #endif
1372   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1373   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1374   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1375   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1376   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1377   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
1378   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1379   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
1380   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1381   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1382   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1383   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1384   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1385 
1386 //=============================================================================
1387 const bool Matcher::match_rule_supported(int opcode) {
1388   if (!has_match_rule(opcode)) {
1389     return false; // no match rule present
1390   }
1391   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1392   switch (opcode) {
1393     case Op_AbsVL:
1394     case Op_StoreVectorScatter:
1395       if (UseAVX < 3) {
1396         return false;
1397       }
1398       break;
1399     case Op_PopCountI:
1400     case Op_PopCountL:
1401       if (!UsePopCountInstruction) {
1402         return false;
1403       }
1404       break;
1405     case Op_PopCountVI:
1406       if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
1407         return false;
1408       }
1409       break;
1410     case Op_MulVI:
1411       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
1412         return false;
1413       }
1414       break;
1415     case Op_MulVL:
1416       if (UseSSE < 4) { // only with SSE4_1 or AVX
1417         return false;
1418       }
1419       break;
1420     case Op_MulReductionVL:
1421       if (VM_Version::supports_avx512dq() == false) {
1422         return false;
1423       }
1424       break;
1425     case Op_AddReductionVL:
1426       if (UseSSE < 2) { // requires at least SSE2
1427         return false;
1428       }
1429       break;
1430     case Op_AbsVB:
1431     case Op_AbsVS:
1432     case Op_AbsVI:
1433     case Op_AddReductionVI:
1434     case Op_AndReductionV:
1435     case Op_OrReductionV:
1436     case Op_XorReductionV:
1437       if (UseSSE < 3) { // requires at least SSSE3
1438         return false;
1439       }
1440       break;
1441     case Op_VectorLoadShuffle:
1442     case Op_VectorRearrange:
1443     case Op_MulReductionVI:
1444       if (UseSSE < 4) { // requires at least SSE4
1445         return false;
1446       }
1447       break;
1448     case Op_SqrtVD:
1449     case Op_SqrtVF:
1450     case Op_VectorMaskCmp:
1451     case Op_VectorCastB2X:
1452     case Op_VectorCastS2X:
1453     case Op_VectorCastI2X:
1454     case Op_VectorCastL2X:
1455     case Op_VectorCastF2X:
1456     case Op_VectorCastD2X:
1457       if (UseAVX < 1) { // enabled for AVX only
1458         return false;
1459       }
1460       break;
1461     case Op_CompareAndSwapL:
1462 #ifdef _LP64
1463     case Op_CompareAndSwapP:
1464 #endif
1465       if (!VM_Version::supports_cx8()) {
1466         return false;
1467       }
1468       break;
1469     case Op_CMoveVF:
1470     case Op_CMoveVD:
1471       if (UseAVX < 1) { // enabled for AVX only
1472         return false;
1473       }
1474       break;
1475     case Op_StrIndexOf:
1476       if (!UseSSE42Intrinsics) {
1477         return false;
1478       }
1479       break;
1480     case Op_StrIndexOfChar:
1481       if (!UseSSE42Intrinsics) {
1482         return false;
1483       }
1484       break;
1485     case Op_OnSpinWait:
1486       if (VM_Version::supports_on_spin_wait() == false) {
1487         return false;
1488       }
1489       break;
1490     case Op_MulVB:
1491     case Op_LShiftVB:
1492     case Op_RShiftVB:
1493     case Op_URShiftVB:
1494     case Op_VectorInsert:
1495     case Op_VectorLoadMask:
1496     case Op_VectorStoreMask:
1497     case Op_VectorBlend:
1498       if (UseSSE < 4) {
1499         return false;
1500       }
1501       break;
1502 #ifdef _LP64
1503     case Op_MaxD:
1504     case Op_MaxF:
1505     case Op_MinD:
1506     case Op_MinF:
1507       if (UseAVX < 1) { // enabled for AVX only
1508         return false;
1509       }
1510       break;
1511 #endif
1512     case Op_CacheWB:
1513     case Op_CacheWBPreSync:
1514     case Op_CacheWBPostSync:
1515       if (!VM_Version::supports_data_cache_line_flush()) {
1516         return false;
1517       }
1518       break;
1519     case Op_ExtractB:
1520     case Op_ExtractL:
1521     case Op_ExtractI:
1522     case Op_RoundDoubleMode:
1523       if (UseSSE < 4) {
1524         return false;
1525       }
1526       break;
1527     case Op_RoundDoubleModeV:
1528       if (VM_Version::supports_avx() == false) {
1529         return false; // 128bit vroundpd is not available
1530       }
1531       break;
1532     case Op_LoadVectorGather:
1533       if (UseAVX < 2) {
1534         return false;
1535       }
1536       break;
1537     case Op_FmaVD:
1538     case Op_FmaVF:
1539       if (!UseFMA) {
1540         return false;
1541       }
1542       break;
1543     case Op_MacroLogicV:
1544       if (UseAVX < 3 || !UseVectorMacroLogic) {
1545         return false;
1546       }
1547       break;
1548 
1549     case Op_VectorCmpMasked:
1550     case Op_VectorMaskGen:
1551     case Op_LoadVectorMasked:
1552     case Op_StoreVectorMasked:
1553       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
1554         return false;
1555       }
1556       break;
1557     case Op_VectorMaskFirstTrue:
1558     case Op_VectorMaskLastTrue:
1559     case Op_VectorMaskTrueCount:
1560     case Op_VectorMaskToLong:
1561       if (!is_LP64 || UseAVX < 1) {
1562          return false;
1563       }
1564       break;
1565     case Op_CopySignD:
1566     case Op_CopySignF:
1567       if (UseAVX < 3 || !is_LP64)  {
1568         return false;
1569       }
1570       if (!VM_Version::supports_avx512vl()) {
1571         return false;
1572       }
1573       break;
1574 #ifndef _LP64
1575     case Op_AddReductionVF:
1576     case Op_AddReductionVD:
1577     case Op_MulReductionVF:
1578     case Op_MulReductionVD:
1579       if (UseSSE < 1) { // requires at least SSE
1580         return false;
1581       }
1582       break;
1583     case Op_MulAddVS2VI:
1584     case Op_RShiftVL:
1585     case Op_AbsVD:
1586     case Op_NegVD:
1587       if (UseSSE < 2) {
1588         return false;
1589       }
1590       break;
1591 #endif // !LP64
1592     case Op_SignumF:
1593       if (UseSSE < 1) {
1594         return false;
1595       }
1596       break;
1597     case Op_SignumD:
1598       if (UseSSE < 2) {
1599         return false;
1600       }
1601       break;
1602   }
1603   return true;  // Match rules are supported by default.
1604 }
1605 
1606 //------------------------------------------------------------------------
1607 
1608 // Identify extra cases in which we might want to provide match rules for vector nodes and
1609 // other intrinsics guarded by vector length (vlen) and element type (bt).
1610 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1611   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1612   if (!match_rule_supported(opcode)) {
1613     return false;
1614   }
1615   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1616   //   * SSE2 supports 128bit vectors for all types;
1617   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1618   //   * AVX2 supports 256bit vectors for all types;
1619   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1620   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1621   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1622   // And MaxVectorSize is taken into account as well.
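       // For example, an 8-element T_INT vector (256 bits) is rejected here on a plain AVX1
       // machine, because AVX1 provides 256-bit vectors only for FLOAT and DOUBLE.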
1623   if (!vector_size_supported(bt, vlen)) {
1624     return false;
1625   }
1626   // Special cases which require vector length follow:
1627   //   * implementation limitations
1628   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1629   //   * 128bit vroundpd instruction is present only in AVX1
1630   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1631   switch (opcode) {
1632     case Op_AbsVF:
1633     case Op_NegVF:
1634       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1635         return false; // 512bit vandps and vxorps are not available
1636       }
1637       break;
1638     case Op_AbsVD:
1639     case Op_NegVD:
1640     case Op_MulVL:
1641       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1642         return false; // 512bit vpmullq, vandpd and vxorpd are not available
1643       }
1644       break;
1645     case Op_CMoveVF:
1646       if (vlen != 8) {
1647         return false; // implementation limitation (only vcmov8F_reg is present)
1648       }
1649       break;
1650     case Op_RotateRightV:
1651     case Op_RotateLeftV:
1652       if (bt != T_INT && bt != T_LONG) {
1653         return false;
1654       } // fallthrough
1655     case Op_MacroLogicV:
1656       if (!VM_Version::supports_evex() ||
1657           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
1658         return false;
1659       }
1660       break;
1661     case Op_ClearArray:
1662     case Op_VectorMaskGen:
1663     case Op_VectorCmpMasked:
1664     case Op_LoadVectorMasked:
1665     case Op_StoreVectorMasked:
1666       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
1667         return false;
1668       }
1669       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
1670         return false;
1671       }
1672       break;
1673     case Op_CMoveVD:
1674       if (vlen != 4) {
1675         return false; // implementation limitation (only vcmov4D_reg is present)
1676       }
1677       break;
1678     case Op_MaxV:
1679     case Op_MinV:
1680       if (UseSSE < 4 && is_integral_type(bt)) {
1681         return false;
1682       }
1683       if (bt == T_FLOAT || bt == T_DOUBLE) {
1684         // Float/Double intrinsics are currently enabled only for the AVX family.
1685         if (UseAVX == 0) {
1686           return false;
1687         }
1688         if (UseAVX > 2 && !VM_Version::supports_avx512dq() && size_in_bits == 512) { // 512-bit Float/Double intrinsics need AVX512DQ
1689           return false;
1690         }
1691       }
1692       break;
1693     case Op_CallLeafVector:
1694       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
1695         return false;
1696       }
1697       break;
1698     case Op_AddReductionVI:
1699       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
1700         return false;
1701       }
1702       // fallthrough
1703     case Op_AndReductionV:
1704     case Op_OrReductionV:
1705     case Op_XorReductionV:
1706       if (is_subword_type(bt) && (UseSSE < 4)) {
1707         return false;
1708       }
1709 #ifndef _LP64
1710       if (bt == T_BYTE || bt == T_LONG) {
1711         return false;
1712       }
1713 #endif
1714       break;
1715 #ifndef _LP64
1716     case Op_VectorInsert:
1717       if (bt == T_LONG || bt == T_DOUBLE) {
1718         return false;
1719       }
1720       break;
1721 #endif
1722     case Op_MinReductionV:
1723     case Op_MaxReductionV:
1724       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
1725         return false;
1726       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
1727         return false;
1728       }
1729       // Float/Double intrinsics enabled for AVX family.
1730       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
1731         return false;
1732       }
1733       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
1734         return false;
1735       }
1736 #ifndef _LP64
1737       if (bt == T_BYTE || bt == T_LONG) {
1738         return false;
1739       }
1740 #endif
1741       break;
1742     case Op_VectorTest:
1743       if (UseSSE < 4) {
1744         return false; // Implementation limitation
1745       } else if (size_in_bits < 32) {
1746         return false; // Implementation limitation
1747       } else if (size_in_bits == 512 && (VM_Version::supports_avx512bw() == false)) {
1748         return false; // Implementation limitation
1749       }
1750       break;
1751     case Op_VectorLoadShuffle:
1752     case Op_VectorRearrange:
1753       if (vlen == 2) {
1754         return false; // Implementation limitation due to how shuffle is loaded
1755       } else if (size_in_bits == 256 && UseAVX < 2) {
1756         return false; // Implementation limitation
1757       } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi())  {
1758         return false; // Implementation limitation
1759       } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw())  {
1760         return false; // Implementation limitation
1761       }
1762       break;
1763     case Op_VectorLoadMask:
1764       if (size_in_bits == 256 && UseAVX < 2) {
1765         return false; // Implementation limitation
1766       }
1767       // fallthrough
1768     case Op_VectorStoreMask:
1769       if (vlen == 2) {
1770         return false; // Implementation limitation
1771       }
1772       break;
1773     case Op_VectorCastB2X:
1774       if (size_in_bits == 256 && UseAVX < 2) {
1775         return false; // Implementation limitation
1776       }
1777       break;
1778     case Op_VectorCastS2X:
1779       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1780         return false;
1781       }
1782       break;
1783     case Op_VectorCastI2X:
1784       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1785         return false;
1786       }
1787       break;
1788     case Op_VectorCastL2X:
1789       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1790         return false;
1791       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
1792         return false;
1793       }
1794       break;
1795     case Op_VectorCastF2X:
1796     case Op_VectorCastD2X:
1797       if (is_integral_type(bt)) {
1798         // Casts from FP to integral types require special fixup logic not easily
1799         // implementable with vectors.
1800         return false; // Implementation limitation
1801       }
1802     case Op_MulReductionVI:
1803       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1804         return false;
1805       }
1806       break;
1807     case Op_LoadVectorGatherMasked:
1808     case Op_StoreVectorScatterMasked:
1809     case Op_StoreVectorScatter:
1810       if (is_subword_type(bt)) {
1811         return false;
1812       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1813         return false;
1814       }
1815       // fallthrough
1816     case Op_LoadVectorGather:
1817       if (size_in_bits == 64) {
1818         return false;
1819       }
1820       break;
1821     case Op_MaskAll:
1822       if (!is_LP64 || !VM_Version::supports_evex()) {
1823         return false;
1824       }
1825       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
1826         return false;
1827       }
1828       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1829         return false;
1830       }
1831       break;
1832     case Op_VectorMaskCmp:
1833       if (vlen < 2 || size_in_bits < 32) {
1834         return false;
1835       }
1836       break;
1837   }
1838   return true;  // Match rules are supported by default.
1839 }
1840 
1841 const bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
1842   // The ADLC-based match_rule_supported routine checks for the existence of a pattern
1843   // based on the IR opcode. Most unary/binary/ternary masked operations share the IR
1844   // nodes of their non-masked counterparts, with the mask edge being the differentiator.
1845   // This routine therefore does a strict check for the existence of masked operation
1846   // patterns: it returns false for every opcode except the ones whose masked
1847   // instruction patterns are defined in this file.
1848   if (!match_rule_supported_vector(opcode, vlen, bt)) {
1849     return false;
1850   }
1851 
1852   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1853   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1854   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
1855     return false;
1856   }
1857   switch(opcode) {
1858     // Unary masked operations
1859     case Op_AbsVB:
1860     case Op_AbsVS:
1861       if (!VM_Version::supports_avx512bw()) {
1862         return false;  // Implementation limitation
1863       }
1864     case Op_AbsVI:
1865     case Op_AbsVL:
1866       return true;
1867 
1868     // Ternary masked operations
1869     case Op_FmaVF:
1870     case Op_FmaVD:
1871       return true;
1872 
1873     // Binary masked operations
1874     case Op_AddVB:
1875     case Op_AddVS:
1876     case Op_SubVB:
1877     case Op_SubVS:
1878     case Op_MulVS:
1879     case Op_LShiftVS:
1880     case Op_RShiftVS:
1881     case Op_URShiftVS:
1882       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1883       if (!VM_Version::supports_avx512bw()) {
1884         return false;  // Implementation limitation
1885       }
1886       return true;
1887 
1888     case Op_MulVL:
1889       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1890       if (!VM_Version::supports_avx512dq()) {
1891         return false;  // Implementation limitation
1892       }
1893       return true;
1894 
1895     case Op_AndV:
1896     case Op_OrV:
1897     case Op_XorV:
1898     case Op_RotateRightV:
1899     case Op_RotateLeftV:
1900       if (bt != T_INT && bt != T_LONG) {
1901         return false; // Implementation limitation
1902       }
1903       return true;
1904 
1905     case Op_VectorLoadMask:
1906       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1907       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
1908         return false;
1909       }
1910       return true;
1911 
1912     case Op_AddVI:
1913     case Op_AddVL:
1914     case Op_AddVF:
1915     case Op_AddVD:
1916     case Op_SubVI:
1917     case Op_SubVL:
1918     case Op_SubVF:
1919     case Op_SubVD:
1920     case Op_MulVI:
1921     case Op_MulVF:
1922     case Op_MulVD:
1923     case Op_DivVF:
1924     case Op_DivVD:
1925     case Op_SqrtVF:
1926     case Op_SqrtVD:
1927     case Op_LShiftVI:
1928     case Op_LShiftVL:
1929     case Op_RShiftVI:
1930     case Op_RShiftVL:
1931     case Op_URShiftVI:
1932     case Op_URShiftVL:
1933     case Op_LoadVectorMasked:
1934     case Op_StoreVectorMasked:
1935     case Op_LoadVectorGatherMasked:
1936     case Op_StoreVectorScatterMasked:
1937       return true;
1938 
1939     case Op_MaxV:
1940     case Op_MinV:
1941       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
1942         return false; // Implementation limitation
1943       }
1944       if (is_floating_point_type(bt)) {
1945         return false; // Implementation limitation
1946       }
1947       return true;
1948 
1949     case Op_VectorMaskCmp:
1950       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
1951         return false; // Implementation limitation
1952       }
1953       return true;
1954 
1955     case Op_VectorRearrange:
1956       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
1957         return false; // Implementation limitation
1958       }
1959       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
1960         return false; // Implementation limitation
1961       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
1962         return false; // Implementation limitation
1963       }
1964       return true;
1965 
1966     // Binary Logical operations
1967     case Op_AndVMask:
1968     case Op_OrVMask:
1969     case Op_XorVMask:
1970       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
1971         return false; // Implementation limitation
1972       }
1973       return true;
1974 
1975     case Op_MaskAll:
1976       return true;
1977 
1978     default:
1979       return false;
1980   }
1981 }
1982 
1983 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
1984   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
1985   bool legacy = (generic_opnd->opcode() == LEGVEC);
1986   if (!VM_Version::supports_avx512vlbwdq() && // KNL
1987       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
1988     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
1989     return new legVecZOper();
1990   }
1991   if (legacy) {
1992     switch (ideal_reg) {
1993       case Op_VecS: return new legVecSOper();
1994       case Op_VecD: return new legVecDOper();
1995       case Op_VecX: return new legVecXOper();
1996       case Op_VecY: return new legVecYOper();
1997       case Op_VecZ: return new legVecZOper();
1998     }
1999   } else {
2000     switch (ideal_reg) {
2001       case Op_VecS: return new vecSOper();
2002       case Op_VecD: return new vecDOper();
2003       case Op_VecX: return new vecXOper();
2004       case Op_VecY: return new vecYOper();
2005       case Op_VecZ: return new vecZOper();
2006     }
2007   }
2008   ShouldNotReachHere();
2009   return NULL;
2010 }
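     // Example: a TEMP vector operand of a 512-bit node is narrowed to legVecZ on a CPU
     // without the AVX512VL/BW/DQ extensions (e.g. KNL), so the register allocator only
     // hands out zmm0-zmm15, which legacy-encoded instructions can still address.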
2011 
2012 bool Matcher::is_reg2reg_move(MachNode* m) {
2013   switch (m->rule()) {
2014     case MoveVec2Leg_rule:
2015     case MoveLeg2Vec_rule:
2016     case MoveF2VL_rule:
2017     case MoveF2LEG_rule:
2018     case MoveVL2F_rule:
2019     case MoveLEG2F_rule:
2020     case MoveD2VL_rule:
2021     case MoveD2LEG_rule:
2022     case MoveVL2D_rule:
2023     case MoveLEG2D_rule:
2024       return true;
2025     default:
2026       return false;
2027   }
2028 }
2029 
2030 bool Matcher::is_generic_vector(MachOper* opnd) {
2031   switch (opnd->opcode()) {
2032     case VEC:
2033     case LEGVEC:
2034       return true;
2035     default:
2036       return false;
2037   }
2038 }
2039 
2040 //------------------------------------------------------------------------
2041 
2042 const RegMask* Matcher::predicate_reg_mask(void) {
2043   return &_VECTMASK_REG_mask;
2044 }
2045 
2046 const TypeVect* Matcher::predicate_reg_type(const Type* elemTy, int length) {
2047   return new TypeVectMask(elemTy, length);
2048 }
2049 
2050 // Max vector size in bytes. 0 if not supported.
2051 const int Matcher::vector_width_in_bytes(BasicType bt) {
2052   assert(is_java_primitive(bt), "only primitive type vectors");
2053   if (UseSSE < 2) return 0;
2054   // SSE2 supports 128bit vectors for all types.
2055   // AVX2 supports 256bit vectors for all types.
2056   // EVEX (AVX512) supports 512bit vectors for all types (BYTE/SHORT/CHAR additionally require AVX512BW, see below).
2057   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
2058   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
2059   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
2060     size = (UseAVX > 2) ? 64 : 32;
2061   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
2062     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
2063   // Use flag to limit vector size.
2064   size = MIN2(size,(int)MaxVectorSize);
2065   // Minimum 2 values in vector (or 4 for bytes).
2066   switch (bt) {
2067   case T_DOUBLE:
2068   case T_LONG:
2069     if (size < 16) return 0;
2070     break;
2071   case T_FLOAT:
2072   case T_INT:
2073     if (size < 8) return 0;
2074     break;
2075   case T_BOOLEAN:
2076     if (size < 4) return 0;
2077     break;
2078   case T_CHAR:
2079     if (size < 4) return 0;
2080     break;
2081   case T_BYTE:
2082     if (size < 4) return 0;
2083     break;
2084   case T_SHORT:
2085     if (size < 4) return 0;
2086     break;
2087   default:
2088     ShouldNotReachHere();
2089   }
2090   return size;
2091 }
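     // Worked example (ignoring the MaxVectorSize cap): UseAVX == 2 gives (1 << 2) * 8 = 32
     // bytes for every type; UseAVX == 3 raises that to 64 bytes, except for BYTE/SHORT/CHAR
     // vectors, which stay at 32 bytes unless AVX512BW is available.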
2092 
2093 // Limits on vector size (number of elements) loaded into vector.
2094 const int Matcher::max_vector_size(const BasicType bt) {
2095   return vector_width_in_bytes(bt)/type2aelembytes(bt);
2096 }
2097 const int Matcher::min_vector_size(const BasicType bt) {
2098   int max_size = max_vector_size(bt);
2099   // Min size which can be loaded into vector is 4 bytes.
2100   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
2101   // Support for calling svml double64 vectors
2102   if (bt == T_DOUBLE) {
2103     size = 1;
2104   }
2105   return MIN2(size,max_size);
2106 }
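     // For example, min_vector_size(T_BYTE) is 4, min_vector_size(T_INT) is 2, and
     // min_vector_size(T_DOUBLE) is 1 (single-element double vectors are allowed so that
     // SVML stubs can be called on them), each clipped to max_vector_size(bt).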
2107 
2108 const int Matcher::scalable_vector_reg_size(const BasicType bt) {
2109   return -1;
2110 }
2111 
2112 // Vector ideal reg corresponding to specified size in bytes
2113 const uint Matcher::vector_ideal_reg(int size) {
2114   assert(MaxVectorSize >= size, "");
2115   switch(size) {
2116     case  4: return Op_VecS;
2117     case  8: return Op_VecD;
2118     case 16: return Op_VecX;
2119     case 32: return Op_VecY;
2120     case 64: return Op_VecZ;
2121   }
2122   ShouldNotReachHere();
2123   return 0;
2124 }
2125 
2126 // Check for shift by small constant as well
2127 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
2128   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
2129       shift->in(2)->get_int() <= 3 &&
2130       // Are there other uses besides address expressions?
2131       !matcher->is_visited(shift)) {
2132     address_visited.set(shift->_idx); // Flag as address_visited
2133     mstack.push(shift->in(2), Matcher::Visit);
2134     Node *conv = shift->in(1);
2135 #ifdef _LP64
2136     // Allow the Matcher to match the rule that bypasses the
2137     // ConvI2L operation for an array index on LP64
2138     // if the index value is positive.
2139     if (conv->Opcode() == Op_ConvI2L &&
2140         conv->as_Type()->type()->is_long()->_lo >= 0 &&
2141         // Are there other uses besides address expressions?
2142         !matcher->is_visited(conv)) {
2143       address_visited.set(conv->_idx); // Flag as address_visited
2144       mstack.push(conv->in(1), Matcher::Pre_Visit);
2145     } else
2146 #endif
2147       mstack.push(conv, Matcher::Pre_Visit);
2148     return true;
2149   }
2150   return false;
2151 }
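     // For example, in (AddP base (LShiftX (ConvI2L index) 2)) both the shift and the
     // ConvI2L are cloned here, so the whole expression can fold into a [base + index*4]
     // addressing mode instead of being materialized in a register.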
2152 
2153 // This function identifies sub-graphs in which a 'load' node is
2154 // an input to two different nodes and can be matched with
2155 // BMI instructions such as blsi, blsr, etc.
2156 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
2157 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
2158 // refers to the same node.
2159 //
2160 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
2161 // This is a temporary solution until we make DAGs expressible in ADL.
2162 template<typename ConType>
2163 class FusedPatternMatcher {
2164   Node* _op1_node;
2165   Node* _mop_node;
2166   int _con_op;
2167 
2168   static int match_next(Node* n, int next_op, int next_op_idx) {
2169     if (n->in(1) == NULL || n->in(2) == NULL) {
2170       return -1;
2171     }
2172 
2173     if (next_op_idx == -1) { // n is commutative, try rotations
2174       if (n->in(1)->Opcode() == next_op) {
2175         return 1;
2176       } else if (n->in(2)->Opcode() == next_op) {
2177         return 2;
2178       }
2179     } else {
2180       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
2181       if (n->in(next_op_idx)->Opcode() == next_op) {
2182         return next_op_idx;
2183       }
2184     }
2185     return -1;
2186   }
2187 
2188  public:
2189   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
2190     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
2191 
2192   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
2193              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
2194              typename ConType::NativeType con_value) {
2195     if (_op1_node->Opcode() != op1) {
2196       return false;
2197     }
2198     if (_mop_node->outcnt() > 2) {
2199       return false;
2200     }
2201     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
2202     if (op1_op2_idx == -1) {
2203       return false;
2204     }
2205     // Memory operation must be the other edge
2206     int op1_mop_idx = (op1_op2_idx & 1) + 1;
2207 
2208     // Check that the mop node is really what we want
2209     if (_op1_node->in(op1_mop_idx) == _mop_node) {
2210       Node* op2_node = _op1_node->in(op1_op2_idx);
2211       if (op2_node->outcnt() > 1) {
2212         return false;
2213       }
2214       assert(op2_node->Opcode() == op2, "Should be");
2215       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
2216       if (op2_con_idx == -1) {
2217         return false;
2218       }
2219       // Memory operation must be the other edge
2220       int op2_mop_idx = (op2_con_idx & 1) + 1;
2221       // Check that the memory operation is the same node
2222       if (op2_node->in(op2_mop_idx) == _mop_node) {
2223         // Now check the constant
2224         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
2225         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
2226           return true;
2227         }
2228       }
2229     }
2230     return false;
2231   }
2232 };
2233 
2234 static bool is_bmi_pattern(Node* n, Node* m) {
2235   assert(UseBMI1Instructions, "sanity");
2236   if (n != NULL && m != NULL) {
2237     if (m->Opcode() == Op_LoadI) {
2238       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
2239       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
2240              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
2241              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
2242     } else if (m->Opcode() == Op_LoadL) {
2243       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
2244       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
2245              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
2246              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
2247     }
2248   }
2249   return false;
2250 }
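     // The three patterns recognized above correspond to the BMI1 instructions:
     //   x & -x      -> blsi   (And (Sub 0 load) load)
     //   x & (x - 1) -> blsr   (And (Add load -1) load)
     //   x ^ (x - 1) -> blsmsk (Xor (Add load -1) load)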
2251 
2252 // Should the matcher clone input 'm' of node 'n'?
2253 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
2254   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
2255   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
2256     mstack.push(m, Visit);
2257     return true;
2258   }
2259   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
2260     mstack.push(m, Visit);           // m = ShiftCntV
2261     return true;
2262   }
2263   return false;
2264 }
2265 
2266 // Should the Matcher clone shifts on addressing modes, expecting them
2267 // to be subsumed into complex addressing expressions, or compute them
2268 // into registers?
2269 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
2270   Node *off = m->in(AddPNode::Offset);
2271   if (off->is_Con()) {
2272     address_visited.test_set(m->_idx); // Flag as address_visited
2273     Node *adr = m->in(AddPNode::Address);
2274 
2275     // Intel can handle 2 adds in addressing mode
2276     // AtomicAdd is not an addressing expression.
2277     // Cheap to find it by looking for screwy base.
2278     if (adr->is_AddP() &&
2279         !adr->in(AddPNode::Base)->is_top() &&
2280         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
2281         // Are there other uses besides address expressions?
2282         !is_visited(adr)) {
2283       address_visited.set(adr->_idx); // Flag as address_visited
2284       Node *shift = adr->in(AddPNode::Offset);
2285       if (!clone_shift(shift, this, mstack, address_visited)) {
2286         mstack.push(shift, Pre_Visit);
2287       }
2288       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
2289       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
2290     } else {
2291       mstack.push(adr, Pre_Visit);
2292     }
2293 
2294     // Clone X+offset as it also folds into most addressing expressions
2295     mstack.push(off, Visit);
2296     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2297     return true;
2298   } else if (clone_shift(off, this, mstack, address_visited)) {
2299     address_visited.test_set(m->_idx); // Flag as address_visited
2300     mstack.push(m->in(AddPNode::Address), Pre_Visit);
2301     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2302     return true;
2303   }
2304   return false;
2305 }
2306 
2307 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
2308   switch (bt) {
2309     case BoolTest::eq:
2310       return Assembler::eq;
2311     case BoolTest::ne:
2312       return Assembler::neq;
2313     case BoolTest::le:
2314     case BoolTest::ule:
2315       return Assembler::le;
2316     case BoolTest::ge:
2317     case BoolTest::uge:
2318       return Assembler::nlt;
2319     case BoolTest::lt:
2320     case BoolTest::ult:
2321       return Assembler::lt;
2322     case BoolTest::gt:
2323     case BoolTest::ugt:
2324       return Assembler::nle;
2325     default : ShouldNotReachHere(); return Assembler::_false;
2326   }
2327 }
2328 
2329 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
2330   switch (bt) {
2331   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
2332   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
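       // (In Java, (Double.NaN != Double.NaN) is true, while every ordered
       // comparison involving NaN is false.)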
2333   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
2334   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
2335   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
2336   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
2337   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
2338   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
2339   }
2340 }
2341 
2342 // Helper methods for MachSpillCopyNode::implementation().
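     // vec_mov_helper emits a vector register-to-register move and vec_spill_helper
     // a vector load/store to a stack slot; when cbuf is NULL they only print the
     // corresponding assembly (non-PRODUCT builds).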
2343 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
2344                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
2345   assert(ireg == Op_VecS || // 32bit vector
2346          ((src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
2347           (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi),
2348          "no non-adjacent vector moves");
2349   if (cbuf) {
2350     C2_MacroAssembler _masm(cbuf);
2351     switch (ireg) {
2352     case Op_VecS: // copy whole register
2353     case Op_VecD:
2354     case Op_VecX:
2355 #ifndef _LP64
2356       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2357 #else
2358       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2359         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2360       } else {
2361         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2362       }
2363 #endif
2364       break;
2365     case Op_VecY:
2366 #ifndef _LP64
2367       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2368 #else
2369       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2370         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2371       } else {
2372         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2373       }
2374 #endif
2375       break;
2376     case Op_VecZ:
2377       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
2378       break;
2379     default:
2380       ShouldNotReachHere();
2381     }
2382 #ifndef PRODUCT
2383   } else {
2384     switch (ireg) {
2385     case Op_VecS:
2386     case Op_VecD:
2387     case Op_VecX:
2388       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2389       break;
2390     case Op_VecY:
2391     case Op_VecZ:
2392       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2393       break;
2394     default:
2395       ShouldNotReachHere();
2396     }
2397 #endif
2398   }
2399 }
2400 
2401 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
2402                      int stack_offset, int reg, uint ireg, outputStream* st) {
2403   if (cbuf) {
2404     C2_MacroAssembler _masm(cbuf);
2405     if (is_load) {
2406       switch (ireg) {
2407       case Op_VecS:
2408         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2409         break;
2410       case Op_VecD:
2411         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2412         break;
2413       case Op_VecX:
2414 #ifndef _LP64
2415         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2416 #else
2417         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2418           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2419         } else {
2420           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2421           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
2422         }
2423 #endif
2424         break;
2425       case Op_VecY:
2426 #ifndef _LP64
2427         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2428 #else
2429         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2430           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2431         } else {
2432           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2433           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
2434         }
2435 #endif
2436         break;
2437       case Op_VecZ:
2438         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
2439         break;
2440       default:
2441         ShouldNotReachHere();
2442       }
2443     } else { // store
2444       switch (ireg) {
2445       case Op_VecS:
2446         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2447         break;
2448       case Op_VecD:
2449         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2450         break;
2451       case Op_VecX:
2452 #ifndef _LP64
2453         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2454 #else
2455         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2456           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2457         }
2458         else {
2459           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2460         }
2461 #endif
2462         break;
2463       case Op_VecY:
2464 #ifndef _LP64
2465         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2466 #else
2467         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2468           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2469         }
2470         else {
2471           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2472         }
2473 #endif
2474         break;
2475       case Op_VecZ:
2476         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2477         break;
2478       default:
2479         ShouldNotReachHere();
2480       }
2481     }
2482 #ifndef PRODUCT
2483   } else {
2484     if (is_load) {
2485       switch (ireg) {
2486       case Op_VecS:
2487         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2488         break;
2489       case Op_VecD:
2490         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2491         break;
2492        case Op_VecX:
2493         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2494         break;
2495       case Op_VecY:
2496       case Op_VecZ:
2497         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2498         break;
2499       default:
2500         ShouldNotReachHere();
2501       }
2502     } else { // store
2503       switch (ireg) {
2504       case Op_VecS:
2505         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2506         break;
2507       case Op_VecD:
2508         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2509         break;
2510        case Op_VecX:
2511         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2512         break;
2513       case Op_VecY:
2514       case Op_VecZ:
2515         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2516         break;
2517       default:
2518         ShouldNotReachHere();
2519       }
2520     }
2521 #endif
2522   }
2523 }
2524 
2525 static inline jlong replicate8_imm(int con, int width) {
2526   // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
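       // For example, replicate8_imm(0x1A, 1) yields 0x1A1A1A1A1A1A1A1A and
       // replicate8_imm(0xBEEF, 2) yields 0xBEEFBEEFBEEFBEEF.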
2527   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
2528   int bit_width = width * 8;
2529   jlong val = con;
2530   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
2531   while(bit_width < 64) {
2532     val |= (val << bit_width);
2533     bit_width <<= 1;
2534   }
2535   return val;
2536 }
2537 
2538 #ifndef PRODUCT
2539   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2540     st->print("nop \t# %d bytes pad for loops and calls", _count);
2541   }
2542 #endif
2543 
2544   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
2545     C2_MacroAssembler _masm(&cbuf);
2546     __ nop(_count);
2547   }
2548 
2549   uint MachNopNode::size(PhaseRegAlloc*) const {
2550     return _count;
2551   }
2552 
2553 #ifndef PRODUCT
2554   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2555     st->print("# breakpoint");
2556   }
2557 #endif
2558 
2559   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2560     C2_MacroAssembler _masm(&cbuf);
2561     __ int3();
2562   }
2563 
2564   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2565     return MachNode::size(ra_);
2566   }
2567 
2568 %}
2569 
2570 encode %{
2571 
2572   enc_class call_epilog %{
2573     C2_MacroAssembler _masm(&cbuf);
2574     if (VerifyStackAtCalls) {
2575       // Check that stack depth is unchanged: find the magic cookie on the stack
2576       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));

2577       Label L;
2578       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2579       __ jccb(Assembler::equal, L);
2580       // Die if stack mismatch
2581       __ int3();
2582       __ bind(L);
2583     }
2584     __ oopmap_metadata(-1);
2585   %}
2586 
2587 %}
2588 
2589 // Operands for bound floating pointer register arguments
2590 operand rxmm0() %{
2591   constraint(ALLOC_IN_RC(xmm0_reg));
2592   match(VecX);
2593   format%{%}
2594   interface(REG_INTER);
2595 %}
2596 
2597 //----------OPERANDS-----------------------------------------------------------
2598 // Operand definitions must precede instruction definitions for correct parsing
2599 // in the ADLC because operands constitute user defined types which are used in
2600 // instruction definitions.
2601 
2602 // Vectors
2603 
2604 // Dummy generic vector class. Should be used for all vector operands.
2605 // Replaced with vec[SDXYZ] during post-selection pass.
2606 operand vec() %{
2607   constraint(ALLOC_IN_RC(dynamic));
2608   match(VecX);
2609   match(VecY);
2610   match(VecZ);
2611   match(VecS);
2612   match(VecD);
2613 
2614   format %{ %}
2615   interface(REG_INTER);
2616 %}
2617 
2618 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
2619 // Replaced with legVec[SDXYZ] during post-selection cleanup.
2620 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
2621 // runtime code generation via reg_class_dynamic.
2622 operand legVec() %{
2623   constraint(ALLOC_IN_RC(dynamic));
2624   match(VecX);
2625   match(VecY);
2626   match(VecZ);
2627   match(VecS);
2628   match(VecD);
2629 
2630   format %{ %}
2631   interface(REG_INTER);
2632 %}
2633 
2634 // Replaces vec during post-selection cleanup. See above.
2635 operand vecS() %{
2636   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
2637   match(VecS);
2638 
2639   format %{ %}
2640   interface(REG_INTER);
2641 %}
2642 
2643 // Replaces legVec during post-selection cleanup. See above.
2644 operand legVecS() %{
2645   constraint(ALLOC_IN_RC(vectors_reg_legacy));
2646   match(VecS);
2647 
2648   format %{ %}
2649   interface(REG_INTER);
2650 %}
2651 
2652 // Replaces vec during post-selection cleanup. See above.
2653 operand vecD() %{
2654   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
2655   match(VecD);
2656 
2657   format %{ %}
2658   interface(REG_INTER);
2659 %}
2660 
2661 // Replaces legVec during post-selection cleanup. See above.
2662 operand legVecD() %{
2663   constraint(ALLOC_IN_RC(vectord_reg_legacy));
2664   match(VecD);
2665 
2666   format %{ %}
2667   interface(REG_INTER);
2668 %}
2669 
2670 // Replaces vec during post-selection cleanup. See above.
2671 operand vecX() %{
2672   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
2673   match(VecX);
2674 
2675   format %{ %}
2676   interface(REG_INTER);
2677 %}
2678 
2679 // Replaces legVec during post-selection cleanup. See above.
2680 operand legVecX() %{
2681   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
2682   match(VecX);
2683 
2684   format %{ %}
2685   interface(REG_INTER);
2686 %}
2687 
2688 // Replaces vec during post-selection cleanup. See above.
2689 operand vecY() %{
2690   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
2691   match(VecY);
2692 
2693   format %{ %}
2694   interface(REG_INTER);
2695 %}
2696 
2697 // Replaces legVec during post-selection cleanup. See above.
2698 operand legVecY() %{
2699   constraint(ALLOC_IN_RC(vectory_reg_legacy));
2700   match(VecY);
2701 
2702   format %{ %}
2703   interface(REG_INTER);
2704 %}
2705 
2706 // Replaces vec during post-selection cleanup. See above.
2707 operand vecZ() %{
2708   constraint(ALLOC_IN_RC(vectorz_reg));
2709   match(VecZ);
2710 
2711   format %{ %}
2712   interface(REG_INTER);
2713 %}
2714 
2715 // Replaces legVec during post-selection cleanup. See above.
2716 operand legVecZ() %{
2717   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
2718   match(VecZ);
2719 
2720   format %{ %}
2721   interface(REG_INTER);
2722 %}
2723 
2724 // Comparison Code for FP conditional move
2725 operand cmpOp_vcmppd() %{
2726   match(Bool);
2727 
2728   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2729             n->as_Bool()->_test._test != BoolTest::no_overflow);
2730   format %{ "" %}
2731   interface(COND_INTER) %{
2732     equal        (0x0, "eq");
2733     less         (0x1, "lt");
2734     less_equal   (0x2, "le");
2735     not_equal    (0xC, "ne");
2736     greater_equal(0xD, "ge");
2737     greater      (0xE, "gt");
2738     //TODO cannot compile (adlc breaks) without two next lines with error:
2739     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2740     // equal' for overflow.
2741     overflow     (0x20, "o");  // not really supported by the instruction
2742     no_overflow  (0x21, "no"); // not really supported by the instruction
2743   %}
2744 %}
2745 
2746 
2747 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2748 
2749 // ============================================================================
2750 
2751 instruct ShouldNotReachHere() %{
2752   match(Halt);
2753   format %{ "stop\t# ShouldNotReachHere" %}
2754   ins_encode %{
2755     if (is_reachable()) {
2756       __ stop(_halt_reason);
2757     }
2758   %}
2759   ins_pipe(pipe_slow);
2760 %}
2761 
2762 // =================================EVEX special===============================
2763 // Existing partial implementation for post-loop multi-versioning computes
2764 // the mask corresponding to tail loop in K1 opmask register. This may then be
2765 // used for predicating instructions in loop body during last post-loop iteration.
2766 // TODO: Remove hard-coded K1 usage while fixing existing post-loop
2767 // multiversioning support.
2768 instruct setMask(rRegI dst, rRegI src, kReg_K1 mask) %{
2769   predicate(PostLoopMultiversioning && Matcher::has_predicated_vectors());
2770   match(Set dst (SetVectMaskI  src));
2771   effect(TEMP dst);
2772   format %{ "setvectmask   $dst, $src" %}
2773   ins_encode %{
2774     __ setvectmask($dst$$Register, $src$$Register, $mask$$KRegister);
2775   %}
2776   ins_pipe(pipe_slow);
2777 %}
2778 
2779 // ============================================================================
2780 
2781 instruct addF_reg(regF dst, regF src) %{
2782   predicate((UseSSE>=1) && (UseAVX == 0));
2783   match(Set dst (AddF dst src));
2784 
2785   format %{ "addss   $dst, $src" %}
2786   ins_cost(150);
2787   ins_encode %{
2788     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2789   %}
2790   ins_pipe(pipe_slow);
2791 %}
2792 
2793 instruct addF_mem(regF dst, memory src) %{
2794   predicate((UseSSE>=1) && (UseAVX == 0));
2795   match(Set dst (AddF dst (LoadF src)));
2796 
2797   format %{ "addss   $dst, $src" %}
2798   ins_cost(150);
2799   ins_encode %{
2800     __ addss($dst$$XMMRegister, $src$$Address);
2801   %}
2802   ins_pipe(pipe_slow);
2803 %}
2804 
2805 instruct addF_imm(regF dst, immF con) %{
2806   predicate((UseSSE>=1) && (UseAVX == 0));
2807   match(Set dst (AddF dst con));
2808   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2809   ins_cost(150);
2810   ins_encode %{
2811     __ addss($dst$$XMMRegister, $constantaddress($con));
2812   %}
2813   ins_pipe(pipe_slow);
2814 %}
2815 
2816 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2817   predicate(UseAVX > 0);
2818   match(Set dst (AddF src1 src2));
2819 
2820   format %{ "vaddss  $dst, $src1, $src2" %}
2821   ins_cost(150);
2822   ins_encode %{
2823     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2824   %}
2825   ins_pipe(pipe_slow);
2826 %}
2827 
2828 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2829   predicate(UseAVX > 0);
2830   match(Set dst (AddF src1 (LoadF src2)));
2831 
2832   format %{ "vaddss  $dst, $src1, $src2" %}
2833   ins_cost(150);
2834   ins_encode %{
2835     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2836   %}
2837   ins_pipe(pipe_slow);
2838 %}
2839 
2840 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2841   predicate(UseAVX > 0);
2842   match(Set dst (AddF src con));
2843 
2844   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2845   ins_cost(150);
2846   ins_encode %{
2847     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2848   %}
2849   ins_pipe(pipe_slow);
2850 %}
2851 
2852 instruct addD_reg(regD dst, regD src) %{
2853   predicate((UseSSE>=2) && (UseAVX == 0));
2854   match(Set dst (AddD dst src));
2855 
2856   format %{ "addsd   $dst, $src" %}
2857   ins_cost(150);
2858   ins_encode %{
2859     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2860   %}
2861   ins_pipe(pipe_slow);
2862 %}
2863 
2864 instruct addD_mem(regD dst, memory src) %{
2865   predicate((UseSSE>=2) && (UseAVX == 0));
2866   match(Set dst (AddD dst (LoadD src)));
2867 
2868   format %{ "addsd   $dst, $src" %}
2869   ins_cost(150);
2870   ins_encode %{
2871     __ addsd($dst$$XMMRegister, $src$$Address);
2872   %}
2873   ins_pipe(pipe_slow);
2874 %}
2875 
2876 instruct addD_imm(regD dst, immD con) %{
2877   predicate((UseSSE>=2) && (UseAVX == 0));
2878   match(Set dst (AddD dst con));
2879   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2880   ins_cost(150);
2881   ins_encode %{
2882     __ addsd($dst$$XMMRegister, $constantaddress($con));
2883   %}
2884   ins_pipe(pipe_slow);
2885 %}
2886 
2887 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2888   predicate(UseAVX > 0);
2889   match(Set dst (AddD src1 src2));
2890 
2891   format %{ "vaddsd  $dst, $src1, $src2" %}
2892   ins_cost(150);
2893   ins_encode %{
2894     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2895   %}
2896   ins_pipe(pipe_slow);
2897 %}
2898 
2899 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2900   predicate(UseAVX > 0);
2901   match(Set dst (AddD src1 (LoadD src2)));
2902 
2903   format %{ "vaddsd  $dst, $src1, $src2" %}
2904   ins_cost(150);
2905   ins_encode %{
2906     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2907   %}
2908   ins_pipe(pipe_slow);
2909 %}
2910 
2911 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2912   predicate(UseAVX > 0);
2913   match(Set dst (AddD src con));
2914 
2915   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2916   ins_cost(150);
2917   ins_encode %{
2918     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2919   %}
2920   ins_pipe(pipe_slow);
2921 %}
2922 
2923 instruct subF_reg(regF dst, regF src) %{
2924   predicate((UseSSE>=1) && (UseAVX == 0));
2925   match(Set dst (SubF dst src));
2926 
2927   format %{ "subss   $dst, $src" %}
2928   ins_cost(150);
2929   ins_encode %{
2930     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2931   %}
2932   ins_pipe(pipe_slow);
2933 %}
2934 
2935 instruct subF_mem(regF dst, memory src) %{
2936   predicate((UseSSE>=1) && (UseAVX == 0));
2937   match(Set dst (SubF dst (LoadF src)));
2938 
2939   format %{ "subss   $dst, $src" %}
2940   ins_cost(150);
2941   ins_encode %{
2942     __ subss($dst$$XMMRegister, $src$$Address);
2943   %}
2944   ins_pipe(pipe_slow);
2945 %}
2946 
2947 instruct subF_imm(regF dst, immF con) %{
2948   predicate((UseSSE>=1) && (UseAVX == 0));
2949   match(Set dst (SubF dst con));
2950   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2951   ins_cost(150);
2952   ins_encode %{
2953     __ subss($dst$$XMMRegister, $constantaddress($con));
2954   %}
2955   ins_pipe(pipe_slow);
2956 %}
2957 
2958 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2959   predicate(UseAVX > 0);
2960   match(Set dst (SubF src1 src2));
2961 
2962   format %{ "vsubss  $dst, $src1, $src2" %}
2963   ins_cost(150);
2964   ins_encode %{
2965     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2966   %}
2967   ins_pipe(pipe_slow);
2968 %}
2969 
2970 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2971   predicate(UseAVX > 0);
2972   match(Set dst (SubF src1 (LoadF src2)));
2973 
2974   format %{ "vsubss  $dst, $src1, $src2" %}
2975   ins_cost(150);
2976   ins_encode %{
2977     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2978   %}
2979   ins_pipe(pipe_slow);
2980 %}
2981 
2982 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2983   predicate(UseAVX > 0);
2984   match(Set dst (SubF src con));
2985 
2986   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2987   ins_cost(150);
2988   ins_encode %{
2989     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2990   %}
2991   ins_pipe(pipe_slow);
2992 %}
2993 
2994 instruct subD_reg(regD dst, regD src) %{
2995   predicate((UseSSE>=2) && (UseAVX == 0));
2996   match(Set dst (SubD dst src));
2997 
2998   format %{ "subsd   $dst, $src" %}
2999   ins_cost(150);
3000   ins_encode %{
3001     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
3002   %}
3003   ins_pipe(pipe_slow);
3004 %}
3005 
3006 instruct subD_mem(regD dst, memory src) %{
3007   predicate((UseSSE>=2) && (UseAVX == 0));
3008   match(Set dst (SubD dst (LoadD src)));
3009 
3010   format %{ "subsd   $dst, $src" %}
3011   ins_cost(150);
3012   ins_encode %{
3013     __ subsd($dst$$XMMRegister, $src$$Address);
3014   %}
3015   ins_pipe(pipe_slow);
3016 %}
3017 
3018 instruct subD_imm(regD dst, immD con) %{
3019   predicate((UseSSE>=2) && (UseAVX == 0));
3020   match(Set dst (SubD dst con));
3021   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3022   ins_cost(150);
3023   ins_encode %{
3024     __ subsd($dst$$XMMRegister, $constantaddress($con));
3025   %}
3026   ins_pipe(pipe_slow);
3027 %}
3028 
3029 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
3030   predicate(UseAVX > 0);
3031   match(Set dst (SubD src1 src2));
3032 
3033   format %{ "vsubsd  $dst, $src1, $src2" %}
3034   ins_cost(150);
3035   ins_encode %{
3036     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3037   %}
3038   ins_pipe(pipe_slow);
3039 %}
3040 
3041 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
3042   predicate(UseAVX > 0);
3043   match(Set dst (SubD src1 (LoadD src2)));
3044 
3045   format %{ "vsubsd  $dst, $src1, $src2" %}
3046   ins_cost(150);
3047   ins_encode %{
3048     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3049   %}
3050   ins_pipe(pipe_slow);
3051 %}
3052 
3053 instruct subD_reg_imm(regD dst, regD src, immD con) %{
3054   predicate(UseAVX > 0);
3055   match(Set dst (SubD src con));
3056 
3057   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3058   ins_cost(150);
3059   ins_encode %{
3060     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3061   %}
3062   ins_pipe(pipe_slow);
3063 %}
3064 
3065 instruct mulF_reg(regF dst, regF src) %{
3066   predicate((UseSSE>=1) && (UseAVX == 0));
3067   match(Set dst (MulF dst src));
3068 
3069   format %{ "mulss   $dst, $src" %}
3070   ins_cost(150);
3071   ins_encode %{
3072     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
3073   %}
3074   ins_pipe(pipe_slow);
3075 %}
3076 
3077 instruct mulF_mem(regF dst, memory src) %{
3078   predicate((UseSSE>=1) && (UseAVX == 0));
3079   match(Set dst (MulF dst (LoadF src)));
3080 
3081   format %{ "mulss   $dst, $src" %}
3082   ins_cost(150);
3083   ins_encode %{
3084     __ mulss($dst$$XMMRegister, $src$$Address);
3085   %}
3086   ins_pipe(pipe_slow);
3087 %}
3088 
3089 instruct mulF_imm(regF dst, immF con) %{
3090   predicate((UseSSE>=1) && (UseAVX == 0));
3091   match(Set dst (MulF dst con));
3092   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3093   ins_cost(150);
3094   ins_encode %{
3095     __ mulss($dst$$XMMRegister, $constantaddress($con));
3096   %}
3097   ins_pipe(pipe_slow);
3098 %}
3099 
3100 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
3101   predicate(UseAVX > 0);
3102   match(Set dst (MulF src1 src2));
3103 
3104   format %{ "vmulss  $dst, $src1, $src2" %}
3105   ins_cost(150);
3106   ins_encode %{
3107     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3108   %}
3109   ins_pipe(pipe_slow);
3110 %}
3111 
3112 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
3113   predicate(UseAVX > 0);
3114   match(Set dst (MulF src1 (LoadF src2)));
3115 
3116   format %{ "vmulss  $dst, $src1, $src2" %}
3117   ins_cost(150);
3118   ins_encode %{
3119     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3120   %}
3121   ins_pipe(pipe_slow);
3122 %}
3123 
3124 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
3125   predicate(UseAVX > 0);
3126   match(Set dst (MulF src con));
3127 
3128   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3129   ins_cost(150);
3130   ins_encode %{
3131     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3132   %}
3133   ins_pipe(pipe_slow);
3134 %}
3135 
3136 instruct mulD_reg(regD dst, regD src) %{
3137   predicate((UseSSE>=2) && (UseAVX == 0));
3138   match(Set dst (MulD dst src));
3139 
3140   format %{ "mulsd   $dst, $src" %}
3141   ins_cost(150);
3142   ins_encode %{
3143     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
3144   %}
3145   ins_pipe(pipe_slow);
3146 %}
3147 
3148 instruct mulD_mem(regD dst, memory src) %{
3149   predicate((UseSSE>=2) && (UseAVX == 0));
3150   match(Set dst (MulD dst (LoadD src)));
3151 
3152   format %{ "mulsd   $dst, $src" %}
3153   ins_cost(150);
3154   ins_encode %{
3155     __ mulsd($dst$$XMMRegister, $src$$Address);
3156   %}
3157   ins_pipe(pipe_slow);
3158 %}
3159 
3160 instruct mulD_imm(regD dst, immD con) %{
3161   predicate((UseSSE>=2) && (UseAVX == 0));
3162   match(Set dst (MulD dst con));
3163   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3164   ins_cost(150);
3165   ins_encode %{
3166     __ mulsd($dst$$XMMRegister, $constantaddress($con));
3167   %}
3168   ins_pipe(pipe_slow);
3169 %}
3170 
3171 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
3172   predicate(UseAVX > 0);
3173   match(Set dst (MulD src1 src2));
3174 
3175   format %{ "vmulsd  $dst, $src1, $src2" %}
3176   ins_cost(150);
3177   ins_encode %{
3178     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3179   %}
3180   ins_pipe(pipe_slow);
3181 %}
3182 
3183 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
3184   predicate(UseAVX > 0);
3185   match(Set dst (MulD src1 (LoadD src2)));
3186 
3187   format %{ "vmulsd  $dst, $src1, $src2" %}
3188   ins_cost(150);
3189   ins_encode %{
3190     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3191   %}
3192   ins_pipe(pipe_slow);
3193 %}
3194 
3195 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
3196   predicate(UseAVX > 0);
3197   match(Set dst (MulD src con));
3198 
3199   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3200   ins_cost(150);
3201   ins_encode %{
3202     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3203   %}
3204   ins_pipe(pipe_slow);
3205 %}
3206 
3207 instruct divF_reg(regF dst, regF src) %{
3208   predicate((UseSSE>=1) && (UseAVX == 0));
3209   match(Set dst (DivF dst src));
3210 
3211   format %{ "divss   $dst, $src" %}
3212   ins_cost(150);
3213   ins_encode %{
3214     __ divss($dst$$XMMRegister, $src$$XMMRegister);
3215   %}
3216   ins_pipe(pipe_slow);
3217 %}
3218 
3219 instruct divF_mem(regF dst, memory src) %{
3220   predicate((UseSSE>=1) && (UseAVX == 0));
3221   match(Set dst (DivF dst (LoadF src)));
3222 
3223   format %{ "divss   $dst, $src" %}
3224   ins_cost(150);
3225   ins_encode %{
3226     __ divss($dst$$XMMRegister, $src$$Address);
3227   %}
3228   ins_pipe(pipe_slow);
3229 %}
3230 
3231 instruct divF_imm(regF dst, immF con) %{
3232   predicate((UseSSE>=1) && (UseAVX == 0));
3233   match(Set dst (DivF dst con));
3234   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3235   ins_cost(150);
3236   ins_encode %{
3237     __ divss($dst$$XMMRegister, $constantaddress($con));
3238   %}
3239   ins_pipe(pipe_slow);
3240 %}
3241 
3242 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
3243   predicate(UseAVX > 0);
3244   match(Set dst (DivF src1 src2));
3245 
3246   format %{ "vdivss  $dst, $src1, $src2" %}
3247   ins_cost(150);
3248   ins_encode %{
3249     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3250   %}
3251   ins_pipe(pipe_slow);
3252 %}
3253 
3254 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
3255   predicate(UseAVX > 0);
3256   match(Set dst (DivF src1 (LoadF src2)));
3257 
3258   format %{ "vdivss  $dst, $src1, $src2" %}
3259   ins_cost(150);
3260   ins_encode %{
3261     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3262   %}
3263   ins_pipe(pipe_slow);
3264 %}
3265 
3266 instruct divF_reg_imm(regF dst, regF src, immF con) %{
3267   predicate(UseAVX > 0);
3268   match(Set dst (DivF src con));
3269 
3270   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3271   ins_cost(150);
3272   ins_encode %{
3273     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3274   %}
3275   ins_pipe(pipe_slow);
3276 %}
3277 
3278 instruct divD_reg(regD dst, regD src) %{
3279   predicate((UseSSE>=2) && (UseAVX == 0));
3280   match(Set dst (DivD dst src));
3281 
3282   format %{ "divsd   $dst, $src" %}
3283   ins_cost(150);
3284   ins_encode %{
3285     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
3286   %}
3287   ins_pipe(pipe_slow);
3288 %}
3289 
3290 instruct divD_mem(regD dst, memory src) %{
3291   predicate((UseSSE>=2) && (UseAVX == 0));
3292   match(Set dst (DivD dst (LoadD src)));
3293 
3294   format %{ "divsd   $dst, $src" %}
3295   ins_cost(150);
3296   ins_encode %{
3297     __ divsd($dst$$XMMRegister, $src$$Address);
3298   %}
3299   ins_pipe(pipe_slow);
3300 %}
3301 
3302 instruct divD_imm(regD dst, immD con) %{
3303   predicate((UseSSE>=2) && (UseAVX == 0));
3304   match(Set dst (DivD dst con));
3305   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3306   ins_cost(150);
3307   ins_encode %{
3308     __ divsd($dst$$XMMRegister, $constantaddress($con));
3309   %}
3310   ins_pipe(pipe_slow);
3311 %}
3312 
3313 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
3314   predicate(UseAVX > 0);
3315   match(Set dst (DivD src1 src2));
3316 
3317   format %{ "vdivsd  $dst, $src1, $src2" %}
3318   ins_cost(150);
3319   ins_encode %{
3320     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3321   %}
3322   ins_pipe(pipe_slow);
3323 %}
3324 
3325 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
3326   predicate(UseAVX > 0);
3327   match(Set dst (DivD src1 (LoadD src2)));
3328 
3329   format %{ "vdivsd  $dst, $src1, $src2" %}
3330   ins_cost(150);
3331   ins_encode %{
3332     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3333   %}
3334   ins_pipe(pipe_slow);
3335 %}
3336 
3337 instruct divD_reg_imm(regD dst, regD src, immD con) %{
3338   predicate(UseAVX > 0);
3339   match(Set dst (DivD src con));
3340 
3341   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3342   ins_cost(150);
3343   ins_encode %{
3344     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3345   %}
3346   ins_pipe(pipe_slow);
3347 %}
3348 
3349 instruct absF_reg(regF dst) %{
3350   predicate((UseSSE>=1) && (UseAVX == 0));
3351   match(Set dst (AbsF dst));
3352   ins_cost(150);
3353   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
3354   ins_encode %{
3355     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
3356   %}
3357   ins_pipe(pipe_slow);
3358 %}
3359 
3360 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
3361   predicate(UseAVX > 0);
3362   match(Set dst (AbsF src));
3363   ins_cost(150);
3364   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
3365   ins_encode %{
3366     int vlen_enc = Assembler::AVX_128bit;
3367     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
3368               ExternalAddress(float_signmask()), vlen_enc);
3369   %}
3370   ins_pipe(pipe_slow);
3371 %}
3372 
3373 instruct absD_reg(regD dst) %{
3374   predicate((UseSSE>=2) && (UseAVX == 0));
3375   match(Set dst (AbsD dst));
3376   ins_cost(150);
3377   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
3378             "# abs double by sign masking" %}
3379   ins_encode %{
3380     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
3381   %}
3382   ins_pipe(pipe_slow);
3383 %}
3384 
3385 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
3386   predicate(UseAVX > 0);
3387   match(Set dst (AbsD src));
3388   ins_cost(150);
3389   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
3390             "# abs double by sign masking" %}
3391   ins_encode %{
3392     int vlen_enc = Assembler::AVX_128bit;
3393     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
3394               ExternalAddress(double_signmask()), vlen_enc);
3395   %}
3396   ins_pipe(pipe_slow);
3397 %}
3398 
3399 instruct negF_reg(regF dst) %{
3400   predicate((UseSSE>=1) && (UseAVX == 0));
3401   match(Set dst (NegF dst));
3402   ins_cost(150);
3403   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
3404   ins_encode %{
3405     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
3406   %}
3407   ins_pipe(pipe_slow);
3408 %}
3409 
3410 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
3411   predicate(UseAVX > 0);
3412   match(Set dst (NegF src));
3413   ins_cost(150);
3414   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
3415   ins_encode %{
3416     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
3417                  ExternalAddress(float_signflip()));
3418   %}
3419   ins_pipe(pipe_slow);
3420 %}
3421 
3422 instruct negD_reg(regD dst) %{
3423   predicate((UseSSE>=2) && (UseAVX == 0));
3424   match(Set dst (NegD dst));
3425   ins_cost(150);
3426   format %{ "xorpd   $dst, [0x8000000000000000]\t"
3427             "# neg double by sign flipping" %}
3428   ins_encode %{
3429     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
3430   %}
3431   ins_pipe(pipe_slow);
3432 %}
3433 
3434 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
3435   predicate(UseAVX > 0);
3436   match(Set dst (NegD src));
3437   ins_cost(150);
3438   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
3439             "# neg double by sign flipping" %}
3440   ins_encode %{
3441     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
3442                  ExternalAddress(double_signflip()));
3443   %}
3444   ins_pipe(pipe_slow);
3445 %}
3446 
3447 // sqrtss instruction needs destination register to be pre initialized for best performance
3448 // Therefore only the instruct rule where the input is pre-loaded into dst register is defined below
3449 instruct sqrtF_reg(regF dst) %{
3450   predicate(UseSSE>=1);
3451   match(Set dst (SqrtF dst));
3452   format %{ "sqrtss  $dst, $dst" %}
3453   ins_encode %{
3454     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
3455   %}
3456   ins_pipe(pipe_slow);
3457 %}
3458 
3459 // sqrtsd instruction needs destination register to be pre initialized for best performance
3460 // Therefore only the instruct rule where the input is pre-loaded into dst register is defined below
3461 instruct sqrtD_reg(regD dst) %{
3462   predicate(UseSSE>=2);
3463   match(Set dst (SqrtD dst));
3464   format %{ "sqrtsd  $dst, $dst" %}
3465   ins_encode %{
3466     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
3467   %}
3468   ins_pipe(pipe_slow);
3469 %}
3470 
3471 
3472 // ---------------------------------------- VectorReinterpret ------------------------------------
3473 instruct reinterpret_mask(kReg dst) %{
3474   predicate(n->bottom_type()->isa_vectmask() &&
3475             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
3476   match(Set dst (VectorReinterpret dst));
3477   ins_cost(125);
3478   format %{ "vector_reinterpret $dst\t!" %}
3479   ins_encode %{
3480     // empty
3481   %}
3482   ins_pipe( pipe_slow );
3483 %}
3484 
3485 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
3486   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3487             n->bottom_type()->isa_vectmask() &&
3488             n->in(1)->bottom_type()->isa_vectmask() &&
3489             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
3490             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
3491   match(Set dst (VectorReinterpret src));
3492   effect(TEMP xtmp);
3493   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
3494   ins_encode %{
3495      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
3496      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3497      assert(src_sz == dst_sz , "src and dst size mismatch");
3498      int vlen_enc = vector_length_encoding(src_sz);
3499      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3500      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3501   %}
3502   ins_pipe( pipe_slow );
3503 %}
3504 
3505 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
3506   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3507             n->bottom_type()->isa_vectmask() &&
3508             n->in(1)->bottom_type()->isa_vectmask() &&
3509             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
3510              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
3511             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
3512   match(Set dst (VectorReinterpret src));
3513   effect(TEMP xtmp);
3514   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
3515   ins_encode %{
3516      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
3517      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3518      assert(src_sz == dst_sz , "src and dst size mismatch");
3519      int vlen_enc = vector_length_encoding(src_sz);
3520      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3521      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3522   %}
3523   ins_pipe( pipe_slow );
3524 %}
3525 
3526 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
3527   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3528             n->bottom_type()->isa_vectmask() &&
3529             n->in(1)->bottom_type()->isa_vectmask() &&
3530             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
3531              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
3532             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
3533   match(Set dst (VectorReinterpret src));
3534   effect(TEMP xtmp);
3535   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
3536   ins_encode %{
3537      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
3538      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3539      assert(src_sz == dst_sz , "src and dst size mismatch");
3540      int vlen_enc = vector_length_encoding(src_sz);
3541      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3542      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3543   %}
3544   ins_pipe( pipe_slow );
3545 %}
3546 
3547 instruct reinterpret(vec dst) %{
3548   predicate(!n->bottom_type()->isa_vectmask() &&
3549             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
3550   match(Set dst (VectorReinterpret dst));
3551   ins_cost(125);
3552   format %{ "vector_reinterpret $dst\t!" %}
3553   ins_encode %{
3554     // empty
3555   %}
3556   ins_pipe( pipe_slow );
3557 %}
3558 
3559 instruct reinterpret_expand(vec dst, vec src, rRegP scratch) %{
3560   predicate(UseAVX == 0 &&
3561             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3562   match(Set dst (VectorReinterpret src));
3563   ins_cost(125);
3564   effect(TEMP dst, TEMP scratch);
3565   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3566   ins_encode %{
3567     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
3568     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
3569 
3570     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
3571     if (src_vlen_in_bytes == 4) {
3572       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3573     } else {
3574       assert(src_vlen_in_bytes == 8, "");
3575       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3576     }
3577     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3578   %}
3579   ins_pipe( pipe_slow );
3580 %}
3581 
3582 instruct vreinterpret_expand4(legVec dst, vec src, rRegP scratch) %{
3583   predicate(UseAVX > 0 &&
3584             !n->bottom_type()->isa_vectmask() &&
3585             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
3586             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3587   match(Set dst (VectorReinterpret src));
3588   ins_cost(125);
3589   effect(TEMP scratch);
3590   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3591   ins_encode %{
3592     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, $scratch$$Register);
3593   %}
3594   ins_pipe( pipe_slow );
3595 %}
3596 
3597 
3598 instruct vreinterpret_expand(legVec dst, vec src) %{
3599   predicate(UseAVX > 0 &&
3600             !n->bottom_type()->isa_vectmask() &&
3601             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
3602             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3603   match(Set dst (VectorReinterpret src));
3604   ins_cost(125);
3605   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
3606   ins_encode %{
3607     switch (Matcher::vector_length_in_bytes(this, $src)) {
3608       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3609       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3610       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3611       default: ShouldNotReachHere();
3612     }
3613   %}
3614   ins_pipe( pipe_slow );
3615 %}
3616 
3617 instruct reinterpret_shrink(vec dst, legVec src) %{
3618   predicate(!n->bottom_type()->isa_vectmask() &&
3619             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
3620   match(Set dst (VectorReinterpret src));
3621   ins_cost(125);
3622   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
3623   ins_encode %{
3624     switch (Matcher::vector_length_in_bytes(this)) {
3625       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
3626       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3627       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3628       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3629       default: ShouldNotReachHere();
3630     }
3631   %}
3632   ins_pipe( pipe_slow );
3633 %}
3634 
3635 // ----------------------------------------------------------------------------------------------------
3636 
3637 #ifdef _LP64
3638 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
3639   match(Set dst (RoundDoubleMode src rmode));
3640   format %{ "roundsd $dst,$src" %}
3641   ins_cost(150);
3642   ins_encode %{
3643     assert(UseSSE >= 4, "required");
3644     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
3645   %}
3646   ins_pipe(pipe_slow);
3647 %}
3648 
3649 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
3650   match(Set dst (RoundDoubleMode (LoadD src) rmode));
3651   format %{ "roundsd $dst,$src" %}
3652   ins_cost(150);
3653   ins_encode %{
3654     assert(UseSSE >= 4, "required");
3655     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
3656   %}
3657   ins_pipe(pipe_slow);
3658 %}
3659 
3660 instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
3661   match(Set dst (RoundDoubleMode con rmode));
3662   effect(TEMP scratch_reg);
3663   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
3664   ins_cost(150);
3665   ins_encode %{
3666     assert(UseSSE >= 4, "required");
3667     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
3668   %}
3669   ins_pipe(pipe_slow);
3670 %}
3671 
3672 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
3673   predicate(Matcher::vector_length(n) < 8);
3674   match(Set dst (RoundDoubleModeV src rmode));
3675   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
3676   ins_encode %{
3677     assert(UseAVX > 0, "required");
3678     int vlen_enc = vector_length_encoding(this);
3679     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
3680   %}
3681   ins_pipe( pipe_slow );
3682 %}
3683 
3684 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
3685   predicate(Matcher::vector_length(n) == 8);
3686   match(Set dst (RoundDoubleModeV src rmode));
3687   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
3688   ins_encode %{
3689     assert(UseAVX > 2, "required");
3690     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
3691   %}
3692   ins_pipe( pipe_slow );
3693 %}
3694 
3695 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3696   predicate(Matcher::vector_length(n) < 8);
3697   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3698   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3699   ins_encode %{
3700     assert(UseAVX > 0, "required");
3701     int vlen_enc = vector_length_encoding(this);
3702     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
3703   %}
3704   ins_pipe( pipe_slow );
3705 %}
3706 
3707 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3708   predicate(Matcher::vector_length(n) == 8);
3709   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3710   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3711   ins_encode %{
3712     assert(UseAVX > 2, "required");
3713     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3714   %}
3715   ins_pipe( pipe_slow );
3716 %}
3717 #endif // _LP64
3718 
3719 instruct onspinwait() %{
3720   match(OnSpinWait);
3721   ins_cost(200);
3722 
3723   format %{
3724     $$template
3725     $$emit$$"pause\t! membar_onspinwait"
3726   %}
3727   ins_encode %{
3728     __ pause();
3729   %}
3730   ins_pipe(pipe_slow);
3731 %}
3732 
3733 // a * b + c
3734 instruct fmaD_reg(regD a, regD b, regD c) %{
3735   predicate(UseFMA);
3736   match(Set c (FmaD  c (Binary a b)));
3737   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3738   ins_cost(150);
3739   ins_encode %{
3740     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3741   %}
3742   ins_pipe( pipe_slow );
3743 %}
3744 
3745 // a * b + c
3746 instruct fmaF_reg(regF a, regF b, regF c) %{
3747   predicate(UseFMA);
3748   match(Set c (FmaF  c (Binary a b)));
3749   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3750   ins_cost(150);
3751   ins_encode %{
3752     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3753   %}
3754   ins_pipe( pipe_slow );
3755 %}
3756 
3757 // ====================VECTOR INSTRUCTIONS=====================================
3758 
3759 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
3760 instruct MoveVec2Leg(legVec dst, vec src) %{
3761   match(Set dst src);
3762   format %{ "" %}
3763   ins_encode %{
3764     ShouldNotReachHere();
3765   %}
3766   ins_pipe( fpu_reg_reg );
3767 %}
3768 
3769 instruct MoveLeg2Vec(vec dst, legVec src) %{
3770   match(Set dst src);
3771   format %{ "" %}
3772   ins_encode %{
3773     ShouldNotReachHere();
3774   %}
3775   ins_pipe( fpu_reg_reg );
3776 %}
3777 
3778 // ============================================================================
3779 
3780 // Load vectors generic operand pattern
3781 instruct loadV(vec dst, memory mem) %{
3782   match(Set dst (LoadVector mem));
3783   ins_cost(125);
3784   format %{ "load_vector $dst,$mem" %}
3785   ins_encode %{
3786     switch (Matcher::vector_length_in_bytes(this)) {
3787       case  4: __ movdl    ($dst$$XMMRegister, $mem$$Address); break;
3788       case  8: __ movq     ($dst$$XMMRegister, $mem$$Address); break;
3789       case 16: __ movdqu   ($dst$$XMMRegister, $mem$$Address); break;
3790       case 32: __ vmovdqu  ($dst$$XMMRegister, $mem$$Address); break;
3791       case 64: __ evmovdqul($dst$$XMMRegister, $mem$$Address, Assembler::AVX_512bit); break;
3792       default: ShouldNotReachHere();
3793     }
3794   %}
3795   ins_pipe( pipe_slow );
3796 %}
3797 
3798 // Store vectors generic operand pattern.
3799 instruct storeV(memory mem, vec src) %{
3800   match(Set mem (StoreVector mem src));
3801   ins_cost(145);
3802   format %{ "store_vector $mem,$src\n\t" %}
3803   ins_encode %{
3804     switch (Matcher::vector_length_in_bytes(this, $src)) {
3805       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
3806       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
3807       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
3808       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
3809       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
3810       default: ShouldNotReachHere();
3811     }
3812   %}
3813   ins_pipe( pipe_slow );
3814 %}
3815 
3816 // ---------------------------------------- Gather ------------------------------------
3817 
3818 // Gather INT, LONG, FLOAT, DOUBLE
3819 
3820 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
3821   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
3822   match(Set dst (LoadVectorGather mem idx));
3823   effect(TEMP dst, TEMP tmp, TEMP mask);
3824   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
3825   ins_encode %{
3826     assert(UseAVX >= 2, "sanity");
3827 
3828     int vlen_enc = vector_length_encoding(this);
3829     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3830 
3831     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
3832     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3833 
3834     if (vlen_enc == Assembler::AVX_128bit) {
3835       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3836     } else {
3837       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3838     }
3839     __ lea($tmp$$Register, $mem$$Address);
3840     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
3841   %}
3842   ins_pipe( pipe_slow );
3843 %}
3844 
3845 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
3846   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
3847   match(Set dst (LoadVectorGather mem idx));
3848   effect(TEMP dst, TEMP tmp, TEMP ktmp);
3849   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and ktmp as TEMP" %}
3850   ins_encode %{
3851     assert(UseAVX > 2, "sanity");
3852 
3853     int vlen_enc = vector_length_encoding(this);
3854     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3855 
3856     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3857 
3858     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3859     __ lea($tmp$$Register, $mem$$Address);
3860     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3861   %}
3862   ins_pipe( pipe_slow );
3863 %}
3864 
3865 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
3866   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
3867   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
3868   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and ktmp as TEMP" %}
3869   ins_encode %{
3870     assert(UseAVX > 2, "sanity");
3871     int vlen_enc = vector_length_encoding(this);
3872     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3873     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3874     // Note: Since the gather instruction partially updates the opmask register used
3875     // for predication, the mask operand is copied to a temporary opmask register.
3876     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
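         // Zero $dst first: the gather merges only into lanes whose mask bit is set, so the
         // cleared value is what masked-off lanes observe.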
3877     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3878     __ lea($tmp$$Register, $mem$$Address);
3879     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3880   %}
3881   ins_pipe( pipe_slow );
3882 %}
3883 // ====================Scatter=======================================
3884 
3885 // Scatter INT, LONG, FLOAT, DOUBLE
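     //
     // The store-side counterpart of gather; roughly:
     //
     //   for (int i = 0; i < vlen; i++) {
     //     base[idx[i]] = src[i];
     //   }
     //
     // Only AVX-512 provides a scatter instruction (evscatter), hence the UseAVX > 2 requirement.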
3886 
3887 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
3888   predicate(UseAVX > 2);
3889   match(Set mem (StoreVectorScatter mem (Binary src idx)));
3890   effect(TEMP tmp, TEMP ktmp);
3891   format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %}
3892   ins_encode %{
3893     int vlen_enc = vector_length_encoding(this, $src);
3894     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
3895 
3896     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
3897     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3898 
3899     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3900     __ lea($tmp$$Register, $mem$$Address);
3901     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
3902   %}
3903   ins_pipe( pipe_slow );
3904 %}
3905 
3906 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
3907   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
3908   effect(TEMP tmp, TEMP ktmp);
3909   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
3910   ins_encode %{
3911     int vlen_enc = vector_length_encoding(this, $src);
3912     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
3913     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
3914     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3915     // Note: Since the scatter instruction partially updates the opmask register used
3916     // for predication, the mask operand is copied to a temporary opmask register.
3917     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
3918     __ lea($tmp$$Register, $mem$$Address);
3919     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
3920   %}
3921   ins_pipe( pipe_slow );
3922 %}
3923 
3924 // ====================REPLICATE=======================================
3925 
3926 // Replicate byte scalar to be vector
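     // i.e. dst[i] = (byte)src for every lane; the ReplicateS/I/L/F/D rules below follow the
     // same broadcast pattern for their element types.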
3927 instruct ReplB_reg(vec dst, rRegI src) %{
3928   match(Set dst (ReplicateB src));
3929   format %{ "replicateB $dst,$src" %}
3930   ins_encode %{
3931     uint vlen = Matcher::vector_length(this);
3932     if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for <512-bit operands
3933       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
3934       int vlen_enc = vector_length_encoding(this);
3935       __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
3936     } else if (VM_Version::supports_avx2()) {
3937       int vlen_enc = vector_length_encoding(this);
3938       __ movdl($dst$$XMMRegister, $src$$Register);
3939       __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3940     } else {
3941       __ movdl($dst$$XMMRegister, $src$$Register);
3942       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3943       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3944       if (vlen >= 16) {
3945         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3946         if (vlen >= 32) {
3947           assert(vlen == 32, "sanity");
3948           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3949         }
3950       }
3951     }
3952   %}
3953   ins_pipe( pipe_slow );
3954 %}
3955 
3956 instruct ReplB_mem(vec dst, memory mem) %{
3957   predicate(VM_Version::supports_avx2());
3958   match(Set dst (ReplicateB (LoadB mem)));
3959   format %{ "replicateB $dst,$mem" %}
3960   ins_encode %{
3961     int vlen_enc = vector_length_encoding(this);
3962     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
3963   %}
3964   ins_pipe( pipe_slow );
3965 %}
3966 
3967 instruct ReplB_imm(vec dst, immI con) %{
3968   match(Set dst (ReplicateB con));
3969   format %{ "replicateB $dst,$con" %}
3970   ins_encode %{
3971     uint vlen = Matcher::vector_length(this);
3972     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 1));
3973     if (vlen == 4) {
3974       __ movdl($dst$$XMMRegister, const_addr);
3975     } else {
3976       __ movq($dst$$XMMRegister, const_addr);
3977       if (vlen >= 16) {
3978         if (VM_Version::supports_avx2()) {
3979           int vlen_enc = vector_length_encoding(this);
3980           __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3981         } else {
3982           assert(vlen == 16, "sanity");
3983           __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3984         }
3985       }
3986     }
3987   %}
3988   ins_pipe( pipe_slow );
3989 %}
3990 
3991 // Replicate byte scalar zero to be vector
3992 instruct ReplB_zero(vec dst, immI_0 zero) %{
3993   match(Set dst (ReplicateB zero));
3994   format %{ "replicateB $dst,$zero" %}
3995   ins_encode %{
3996     uint vlen = Matcher::vector_length(this);
3997     if (vlen <= 16) {
3998       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3999     } else {
4000       // Use vpxor since AVX512F does not have 512bit vxorpd (requires AVX512DQ).
4001       int vlen_enc = vector_length_encoding(this);
4002       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4003     }
4004   %}
4005   ins_pipe( fpu_reg_reg );
4006 %}
4007 
4008 // ====================ReplicateS=======================================
4009 
4010 instruct ReplS_reg(vec dst, rRegI src) %{
4011   match(Set dst (ReplicateS src));
4012   format %{ "replicateS $dst,$src" %}
4013   ins_encode %{
4014     uint vlen = Matcher::vector_length(this);
4015     if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for <512-bit operands
4016       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
4017       int vlen_enc = vector_length_encoding(this);
4018       __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
4019     } else if (VM_Version::supports_avx2()) {
4020       int vlen_enc = vector_length_encoding(this);
4021       __ movdl($dst$$XMMRegister, $src$$Register);
4022       __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4023     } else {
4024       __ movdl($dst$$XMMRegister, $src$$Register);
4025       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4026       if (vlen >= 8) {
4027         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4028         if (vlen >= 16) {
4029           assert(vlen == 16, "sanity");
4030           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4031         }
4032       }
4033     }
4034   %}
4035   ins_pipe( pipe_slow );
4036 %}
4037 
4038 instruct ReplS_mem(vec dst, memory mem) %{
4039   predicate(VM_Version::supports_avx2());
4040   match(Set dst (ReplicateS (LoadS mem)));
4041   format %{ "replicateS $dst,$mem" %}
4042   ins_encode %{
4043     int vlen_enc = vector_length_encoding(this);
4044     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
4045   %}
4046   ins_pipe( pipe_slow );
4047 %}
4048 
4049 instruct ReplS_imm(vec dst, immI con) %{
4050   match(Set dst (ReplicateS con));
4051   format %{ "replicateS $dst,$con" %}
4052   ins_encode %{
4053     uint vlen = Matcher::vector_length(this);
4054     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 2));
4055     if (vlen == 2) {
4056       __ movdl($dst$$XMMRegister, const_addr);
4057     } else {
4058       __ movq($dst$$XMMRegister, const_addr);
4059       if (vlen >= 8) {
4060         if (VM_Version::supports_avx2()) {
4061           int vlen_enc = vector_length_encoding(this);
4062           __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4063         } else {
4064           assert(vlen == 8, "sanity");
4065           __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4066         }
4067       }
4068     }
4069   %}
4070   ins_pipe( fpu_reg_reg );
4071 %}
4072 
4073 instruct ReplS_zero(vec dst, immI_0 zero) %{
4074   match(Set dst (ReplicateS zero));
4075   format %{ "replicateS $dst,$zero" %}
4076   ins_encode %{
4077     uint vlen = Matcher::vector_length(this);
4078     if (vlen <= 8) {
4079       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4080     } else {
4081       int vlen_enc = vector_length_encoding(this);
4082       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4083     }
4084   %}
4085   ins_pipe( fpu_reg_reg );
4086 %}
4087 
4088 // ====================ReplicateI=======================================
4089 
4090 instruct ReplI_reg(vec dst, rRegI src) %{
4091   match(Set dst (ReplicateI src));
4092   format %{ "replicateI $dst,$src" %}
4093   ins_encode %{
4094     uint vlen = Matcher::vector_length(this);
4095     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4096       int vlen_enc = vector_length_encoding(this);
4097       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
4098     } else if (VM_Version::supports_avx2()) {
4099       int vlen_enc = vector_length_encoding(this);
4100       __ movdl($dst$$XMMRegister, $src$$Register);
4101       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4102     } else {
4103       __ movdl($dst$$XMMRegister, $src$$Register);
4104       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4105       if (vlen >= 8) {
4106         assert(vlen == 8, "sanity");
4107         __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4108       }
4109     }
4110   %}
4111   ins_pipe( pipe_slow );
4112 %}
4113 
4114 instruct ReplI_mem(vec dst, memory mem) %{
4115   match(Set dst (ReplicateI (LoadI mem)));
4116   format %{ "replicateI $dst,$mem" %}
4117   ins_encode %{
4118     uint vlen = Matcher::vector_length(this);
4119     if (vlen <= 4) {
4120       __ movdl($dst$$XMMRegister, $mem$$Address);
4121       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4122     } else {
4123       assert(VM_Version::supports_avx2(), "sanity");
4124       int vlen_enc = vector_length_encoding(this);
4125       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4126     }
4127   %}
4128   ins_pipe( pipe_slow );
4129 %}
4130 
4131 instruct ReplI_imm(vec dst, immI con) %{
4132   match(Set dst (ReplicateI con));
4133   format %{ "replicateI $dst,$con" %}
4134   ins_encode %{
4135     uint vlen = Matcher::vector_length(this);
4136     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 4));
4137     if (vlen <= 4) {
4138       __ movq($dst$$XMMRegister, const_addr);
4139       if (vlen == 4) {
4140         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4141       }
4142     } else {
4143       assert(VM_Version::supports_avx2(), "sanity");
4144       int vlen_enc = vector_length_encoding(this);
4145       __ movq($dst$$XMMRegister, const_addr);
4146       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4147     }
4148   %}
4149   ins_pipe( pipe_slow );
4150 %}
4151 
4152 // Replicate integer (4 byte) scalar zero to be vector
4153 instruct ReplI_zero(vec dst, immI_0 zero) %{
4154   match(Set dst (ReplicateI zero));
4155   format %{ "replicateI $dst,$zero" %}
4156   ins_encode %{
4157     uint vlen = Matcher::vector_length(this);
4158     if (vlen <= 4) {
4159       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4160     } else {
4161       int vlen_enc = vector_length_encoding(this);
4162       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4163     }
4164   %}
4165   ins_pipe( fpu_reg_reg );
4166 %}
4167 
4168 instruct ReplI_M1(vec dst, immI_M1 con) %{
4169   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) >= 16);
4170   match(Set dst (ReplicateB con));
4171   match(Set dst (ReplicateS con));
4172   match(Set dst (ReplicateI con));
4173   effect(TEMP dst);
4174   format %{ "vallones $dst" %}
4175   ins_encode %{
4176     int vector_len = vector_length_encoding(this);
4177     __ vallones($dst$$XMMRegister, vector_len);
4178   %}
4179   ins_pipe( pipe_slow );
4180 %}
4181 
4182 // ====================ReplicateL=======================================
4183 
4184 #ifdef _LP64
4185 // Replicate long (8 byte) scalar to be vector
4186 instruct ReplL_reg(vec dst, rRegL src) %{
4187   match(Set dst (ReplicateL src));
4188   format %{ "replicateL $dst,$src" %}
4189   ins_encode %{
4190     uint vlen = Matcher::vector_length(this);
4191     if (vlen == 2) {
4192       __ movdq($dst$$XMMRegister, $src$$Register);
4193       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4194     } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4195       int vlen_enc = vector_length_encoding(this);
4196       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
4197     } else if (VM_Version::supports_avx2()) {
4198       assert(vlen == 4, "sanity");
4199       int vlen_enc = vector_length_encoding(this);
4200       __ movdq($dst$$XMMRegister, $src$$Register);
4201       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4202     } else {
4203       assert(vlen == 4, "sanity");
4204       __ movdq($dst$$XMMRegister, $src$$Register);
4205       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4206       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4207     }
4208   %}
4209   ins_pipe( pipe_slow );
4210 %}
4211 #else // _LP64
4212 // Replicate long (8 byte) scalar to be vector
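     // On 32-bit, the long is held in a register pair, so the low and high halves are moved
     // into the XMM register separately and recombined with punpckldq before broadcasting.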
4213 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
4214   predicate(Matcher::vector_length(n) <= 4);
4215   match(Set dst (ReplicateL src));
4216   effect(TEMP dst, USE src, TEMP tmp);
4217   format %{ "replicateL $dst,$src" %}
4218   ins_encode %{
4219     uint vlen = Matcher::vector_length(this);
4220     if (vlen == 2) {
4221       __ movdl($dst$$XMMRegister, $src$$Register);
4222       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4223       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4224       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4225     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4226       int vlen_enc = Assembler::AVX_256bit;
4227       __ movdl($dst$$XMMRegister, $src$$Register);
4228       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4229       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4230       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4231     } else {
4232       __ movdl($dst$$XMMRegister, $src$$Register);
4233       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4234       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4235       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4236       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4237     }
4238   %}
4239   ins_pipe( pipe_slow );
4240 %}
4241 
4242 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
4243   predicate(Matcher::vector_length(n) == 8);
4244   match(Set dst (ReplicateL src));
4245   effect(TEMP dst, USE src, TEMP tmp);
4246   format %{ "replicateL $dst,$src" %}
4247   ins_encode %{
4248     if (VM_Version::supports_avx512vl()) {
4249       __ movdl($dst$$XMMRegister, $src$$Register);
4250       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4251       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4252       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4253       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4254       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4255     } else {
4256       int vlen_enc = Assembler::AVX_512bit;
4257       __ movdl($dst$$XMMRegister, $src$$Register);
4258       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4259       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4260       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4261     }
4262   %}
4263   ins_pipe( pipe_slow );
4264 %}
4265 #endif // _LP64
4266 
4267 instruct ReplL_mem(vec dst, memory mem) %{
4268   match(Set dst (ReplicateL (LoadL mem)));
4269   format %{ "replicateL $dst,$mem" %}
4270   ins_encode %{
4271     uint vlen = Matcher::vector_length(this);
4272     if (vlen == 2) {
4273       __ movq($dst$$XMMRegister, $mem$$Address);
4274       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4275     } else {
4276       assert(VM_Version::supports_avx2(), "sanity");
4277       int vlen_enc = vector_length_encoding(this);
4278       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
4279     }
4280   %}
4281   ins_pipe( pipe_slow );
4282 %}
4283 
4284 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
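     // $constantaddress materializes the 64-bit immediate in the constant table; the rule then
     // loads it with movq and broadcasts it the same way as the register variant above.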
4285 instruct ReplL_imm(vec dst, immL con) %{
4286   match(Set dst (ReplicateL con));
4287   format %{ "replicateL $dst,$con" %}
4288   ins_encode %{
4289     uint vlen = Matcher::vector_length(this);
4290     InternalAddress const_addr = $constantaddress($con);
4291     if (vlen == 2) {
4292       __ movq($dst$$XMMRegister, const_addr);
4293       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4294     } else {
4295       assert(VM_Version::supports_avx2(), "sanity");
4296       int vlen_enc = vector_length_encoding(this);
4297       __ movq($dst$$XMMRegister, const_addr);
4298       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4299     }
4300   %}
4301   ins_pipe( pipe_slow );
4302 %}
4303 
4304 instruct ReplL_zero(vec dst, immL0 zero) %{
4305   match(Set dst (ReplicateL zero));
4306   format %{ "replicateL $dst,$zero" %}
4307   ins_encode %{
4308     int vlen = Matcher::vector_length(this);
4309     if (vlen == 2) {
4310       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4311     } else {
4312       int vlen_enc = vector_length_encoding(this);
4313       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4314     }
4315   %}
4316   ins_pipe( fpu_reg_reg );
4317 %}
4318 
4319 instruct ReplL_M1(vec dst, immL_M1 con) %{
4320   predicate(UseAVX > 0);
4321   match(Set dst (ReplicateL con));
4322   effect(TEMP dst);
4323   format %{ "vallones $dst" %}
4324   ins_encode %{
4325     int vector_len = vector_length_encoding(this);
4326     __ vallones($dst$$XMMRegister, vector_len);
4327   %}
4328   ins_pipe( pipe_slow );
4329 %}
4330 
4331 // ====================ReplicateF=======================================
4332 
4333 instruct ReplF_reg(vec dst, vlRegF src) %{
4334   match(Set dst (ReplicateF src));
4335   format %{ "replicateF $dst,$src" %}
4336   ins_encode %{
4337     uint vlen = Matcher::vector_length(this);
4338     if (vlen <= 4) {
4339       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4340     } else if (VM_Version::supports_avx2()) {
4341       int vlen_enc = vector_length_encoding(this);
4342       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4343     } else {
4344       assert(vlen == 8, "sanity");
4345       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4346       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4347     }
4348   %}
4349   ins_pipe( pipe_slow );
4350 %}
4351 
4352 instruct ReplF_mem(vec dst, memory mem) %{
4353   match(Set dst (ReplicateF (LoadF mem)));
4354   format %{ "replicateF $dst,$mem" %}
4355   ins_encode %{
4356     uint vlen = Matcher::vector_length(this);
4357     if (vlen <= 4) {
4358       __ movdl($dst$$XMMRegister, $mem$$Address);
4359       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4360     } else {
4361       assert(VM_Version::supports_avx(), "sanity");
4362       int vlen_enc = vector_length_encoding(this);
4363       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
4364     }
4365   %}
4366   ins_pipe( pipe_slow );
4367 %}
4368 
4369 instruct ReplF_zero(vec dst, immF0 zero) %{
4370   match(Set dst (ReplicateF zero));
4371   format %{ "replicateF $dst,$zero" %}
4372   ins_encode %{
4373     uint vlen = Matcher::vector_length(this);
4374     if (vlen <= 4) {
4375       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4376     } else {
4377       int vlen_enc = vector_length_encoding(this);
4378       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4379     }
4380   %}
4381   ins_pipe( fpu_reg_reg );
4382 %}
4383 
4384 // ====================ReplicateD=======================================
4385 
4386 // Replicate double (8 bytes) scalar to be vector
4387 instruct ReplD_reg(vec dst, vlRegD src) %{
4388   match(Set dst (ReplicateD src));
4389   format %{ "replicateD $dst,$src" %}
4390   ins_encode %{
4391     uint vlen = Matcher::vector_length(this);
4392     if (vlen == 2) {
4393       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4394     } else if (VM_Version::supports_avx2()) {
4395       int vlen_enc = vector_length_encoding(this);
4396       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4397     } else {
4398       assert(vlen == 4, "sanity");
4399       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4400       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4401     }
4402   %}
4403   ins_pipe( pipe_slow );
4404 %}
4405 
4406 instruct ReplD_mem(vec dst, memory mem) %{
4407   match(Set dst (ReplicateD (LoadD mem)));
4408   format %{ "replicateD $dst,$mem" %}
4409   ins_encode %{
4410     uint vlen = Matcher::vector_length(this);
4411     if (vlen == 2) {
4412       __ movq($dst$$XMMRegister, $mem$$Address);
4413       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
4414     } else {
4415       assert(VM_Version::supports_avx(), "sanity");
4416       int vlen_enc = vector_length_encoding(this);
4417       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4418     }
4419   %}
4420   ins_pipe( pipe_slow );
4421 %}
4422 
4423 instruct ReplD_zero(vec dst, immD0 zero) %{
4424   match(Set dst (ReplicateD zero));
4425   format %{ "replicateD $dst,$zero" %}
4426   ins_encode %{
4427     uint vlen = Matcher::vector_length(this);
4428     if (vlen == 2) {
4429       __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4430     } else {
4431       int vlen_enc = vector_length_encoding(this);
4432       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorpd requires AVX512DQ
4433     }
4434   %}
4435   ins_pipe( fpu_reg_reg );
4436 %}
4437 
4438 // ====================VECTOR INSERT=======================================
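     //
     // VectorInsert writes $val into the lane selected by the constant $idx and leaves all other
     // lanes unchanged.  For vectors wider than 128 bits the rules below operate on the 128-bit
     // lane containing the element, roughly:
     //
     //   x_idx = idx % elems_per_128bit_lane;   y_idx = idx / elems_per_128bit_lane;
     //   vtmp  = extract_128bit_lane(src, y_idx);
     //   vtmp  = insert_scalar(vtmp, val, x_idx);
     //   dst   = insert_128bit_lane(src, vtmp, y_idx);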
4439 
4440 instruct insert(vec dst, rRegI val, immU8 idx) %{
4441   predicate(Matcher::vector_length_in_bytes(n) < 32);
4442   match(Set dst (VectorInsert (Binary dst val) idx));
4443   format %{ "vector_insert $dst,$val,$idx" %}
4444   ins_encode %{
4445     assert(UseSSE >= 4, "required");
4446     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
4447 
4448     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4449 
4450     assert(is_integral_type(elem_bt), "");
4451     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4452 
4453     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4454   %}
4455   ins_pipe( pipe_slow );
4456 %}
4457 
4458 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4459   predicate(Matcher::vector_length_in_bytes(n) == 32);
4460   match(Set dst (VectorInsert (Binary src val) idx));
4461   effect(TEMP vtmp);
4462   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4463   ins_encode %{
4464     int vlen_enc = Assembler::AVX_256bit;
4465     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4466     int elem_per_lane = 16/type2aelembytes(elem_bt);
4467     int log2epr = log2(elem_per_lane);
4468 
4469     assert(is_integral_type(elem_bt), "sanity");
4470     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4471 
4472     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4473     uint y_idx = ($idx$$constant >> log2epr) & 1;
4474     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4475     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4476     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4477   %}
4478   ins_pipe( pipe_slow );
4479 %}
4480 
4481 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4482   predicate(Matcher::vector_length_in_bytes(n) == 64);
4483   match(Set dst (VectorInsert (Binary src val) idx));
4484   effect(TEMP vtmp);
4485   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4486   ins_encode %{
4487     assert(UseAVX > 2, "sanity");
4488 
4489     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4490     int elem_per_lane = 16/type2aelembytes(elem_bt);
4491     int log2epr = log2(elem_per_lane);
4492 
4493     assert(is_integral_type(elem_bt), "");
4494     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4495 
4496     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4497     uint y_idx = ($idx$$constant >> log2epr) & 3;
4498     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4499     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4500     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4501   %}
4502   ins_pipe( pipe_slow );
4503 %}
4504 
4505 #ifdef _LP64
4506 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4507   predicate(Matcher::vector_length(n) == 2);
4508   match(Set dst (VectorInsert (Binary dst val) idx));
4509   format %{ "vector_insert $dst,$val,$idx" %}
4510   ins_encode %{
4511     assert(UseSSE >= 4, "required");
4512     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4513     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4514 
4515     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4516   %}
4517   ins_pipe( pipe_slow );
4518 %}
4519 
4520 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4521   predicate(Matcher::vector_length(n) == 4);
4522   match(Set dst (VectorInsert (Binary src val) idx));
4523   effect(TEMP vtmp);
4524   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4525   ins_encode %{
4526     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4527     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4528 
4529     uint x_idx = $idx$$constant & right_n_bits(1);
4530     uint y_idx = ($idx$$constant >> 1) & 1;
4531     int vlen_enc = Assembler::AVX_256bit;
4532     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4533     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4534     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4535   %}
4536   ins_pipe( pipe_slow );
4537 %}
4538 
4539 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4540   predicate(Matcher::vector_length(n) == 8);
4541   match(Set dst (VectorInsert (Binary src val) idx));
4542   effect(TEMP vtmp);
4543   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4544   ins_encode %{
4545     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
4546     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4547 
4548     uint x_idx = $idx$$constant & right_n_bits(1);
4549     uint y_idx = ($idx$$constant >> 1) & 3;
4550     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4551     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4552     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4553   %}
4554   ins_pipe( pipe_slow );
4555 %}
4556 #endif
4557 
4558 instruct insertF(vec dst, regF val, immU8 idx) %{
4559   predicate(Matcher::vector_length(n) < 8);
4560   match(Set dst (VectorInsert (Binary dst val) idx));
4561   format %{ "vector_insert $dst,$val,$idx" %}
4562   ins_encode %{
4563     assert(UseSSE >= 4, "sanity");
4564 
4565     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4566     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4567 
4568     __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
4569   %}
4570   ins_pipe( pipe_slow );
4571 %}
4572 
4573 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4574   predicate(Matcher::vector_length(n) >= 8);
4575   match(Set dst (VectorInsert (Binary src val) idx));
4576   effect(TEMP vtmp);
4577   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4578   ins_encode %{
4579     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4580     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4581 
4582     int vlen = Matcher::vector_length(this);
4583     uint x_idx = $idx$$constant & right_n_bits(2);
4584     if (vlen == 8) {
4585       uint y_idx = ($idx$$constant >> 2) & 1;
4586       int vlen_enc = Assembler::AVX_256bit;
4587       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4588       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4589       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4590     } else {
4591       assert(vlen == 16, "sanity");
4592       uint y_idx = ($idx$$constant >> 2) & 3;
4593       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4594       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4595       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4596     }
4597   %}
4598   ins_pipe( pipe_slow );
4599 %}
4600 
4601 #ifdef _LP64
4602 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4603   predicate(Matcher::vector_length(n) == 2);
4604   match(Set dst (VectorInsert (Binary dst val) idx));
4605   effect(TEMP tmp);
4606   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4607   ins_encode %{
4608     assert(UseSSE >= 4, "sanity");
4609     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4610     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4611 
4612     __ movq($tmp$$Register, $val$$XMMRegister);
4613     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
4614   %}
4615   ins_pipe( pipe_slow );
4616 %}
4617 
4618 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
4619   predicate(Matcher::vector_length(n) == 4);
4620   match(Set dst (VectorInsert (Binary src val) idx));
4621   effect(TEMP vtmp, TEMP tmp);
4622   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4623   ins_encode %{
4624     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4625     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4626 
4627     uint x_idx = $idx$$constant & right_n_bits(1);
4628     uint y_idx = ($idx$$constant >> 1) & 1;
4629     int vlen_enc = Assembler::AVX_256bit;
4630     __ movq($tmp$$Register, $val$$XMMRegister);
4631     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4632     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4633     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4634   %}
4635   ins_pipe( pipe_slow );
4636 %}
4637 
4638 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
4639   predicate(Matcher::vector_length(n) == 8);
4640   match(Set dst (VectorInsert (Binary src val) idx));
4641   effect(TEMP tmp, TEMP vtmp);
4642   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4643   ins_encode %{
4644     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4645     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4646 
4647     uint x_idx = $idx$$constant & right_n_bits(1);
4648     uint y_idx = ($idx$$constant >> 1) & 3;
4649     __ movq($tmp$$Register, $val$$XMMRegister);
4650     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4651     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4652     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4653   %}
4654   ins_pipe( pipe_slow );
4655 %}
4656 #endif
4657 
4658 // ====================REDUCTION ARITHMETIC=======================================
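     //
     // A reduction folds every lane of the vector input into a single scalar and combines it
     // with the scalar input; for the integral forms below, roughly:
     //
     //   acc = src1;
     //   for (int i = 0; i < vlen; i++) acc = OP(acc, src2[i]);
     //   dst = acc;
     //
     // where OP is add/mul/and/or/xor/min/max as selected by the matched ideal node.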
4659 
4660 // =======================Int Reduction==========================================
4661 
4662 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4663   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
4664   match(Set dst (AddReductionVI src1 src2));
4665   match(Set dst (MulReductionVI src1 src2));
4666   match(Set dst (AndReductionV  src1 src2));
4667   match(Set dst ( OrReductionV  src1 src2));
4668   match(Set dst (XorReductionV  src1 src2));
4669   match(Set dst (MinReductionV  src1 src2));
4670   match(Set dst (MaxReductionV  src1 src2));
4671   effect(TEMP vtmp1, TEMP vtmp2);
4672   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4673   ins_encode %{
4674     int opcode = this->ideal_Opcode();
4675     int vlen = Matcher::vector_length(this, $src2);
4676     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4677   %}
4678   ins_pipe( pipe_slow );
4679 %}
4680 
4681 // =======================Long Reduction==========================================
4682 
4683 #ifdef _LP64
4684 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4685   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
4686   match(Set dst (AddReductionVL src1 src2));
4687   match(Set dst (MulReductionVL src1 src2));
4688   match(Set dst (AndReductionV  src1 src2));
4689   match(Set dst ( OrReductionV  src1 src2));
4690   match(Set dst (XorReductionV  src1 src2));
4691   match(Set dst (MinReductionV  src1 src2));
4692   match(Set dst (MaxReductionV  src1 src2));
4693   effect(TEMP vtmp1, TEMP vtmp2);
4694   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4695   ins_encode %{
4696     int opcode = this->ideal_Opcode();
4697     int vlen = Matcher::vector_length(this, $src2);
4698     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4699   %}
4700   ins_pipe( pipe_slow );
4701 %}
4702 
4703 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
4704   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
4705   match(Set dst (AddReductionVL src1 src2));
4706   match(Set dst (MulReductionVL src1 src2));
4707   match(Set dst (AndReductionV  src1 src2));
4708   match(Set dst ( OrReductionV  src1 src2));
4709   match(Set dst (XorReductionV  src1 src2));
4710   match(Set dst (MinReductionV  src1 src2));
4711   match(Set dst (MaxReductionV  src1 src2));
4712   effect(TEMP vtmp1, TEMP vtmp2);
4713   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4714   ins_encode %{
4715     int opcode = this->ideal_Opcode();
4716     int vlen = Matcher::vector_length(this, $src2);
4717     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4718   %}
4719   ins_pipe( pipe_slow );
4720 %}
4721 #endif // _LP64
4722 
4723 // =======================Float Reduction==========================================
4724 
4725 instruct reductionF128(regF dst, vec src, vec vtmp) %{
4726   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
4727   match(Set dst (AddReductionVF dst src));
4728   match(Set dst (MulReductionVF dst src));
4729   effect(TEMP dst, TEMP vtmp);
4730   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
4731   ins_encode %{
4732     int opcode = this->ideal_Opcode();
4733     int vlen = Matcher::vector_length(this, $src);
4734     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4735   %}
4736   ins_pipe( pipe_slow );
4737 %}
4738 
4739 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
4740   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4741   match(Set dst (AddReductionVF dst src));
4742   match(Set dst (MulReductionVF dst src));
4743   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4744   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4745   ins_encode %{
4746     int opcode = this->ideal_Opcode();
4747     int vlen = Matcher::vector_length(this, $src);
4748     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4749   %}
4750   ins_pipe( pipe_slow );
4751 %}
4752 
4753 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4754   predicate(Matcher::vector_length(n->in(2)) == 16); // src
4755   match(Set dst (AddReductionVF dst src));
4756   match(Set dst (MulReductionVF dst src));
4757   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4758   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4759   ins_encode %{
4760     int opcode = this->ideal_Opcode();
4761     int vlen = Matcher::vector_length(this, $src);
4762     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4763   %}
4764   ins_pipe( pipe_slow );
4765 %}
4766 
4767 // =======================Double Reduction==========================================
4768 
4769 instruct reduction2D(regD dst, vec src, vec vtmp) %{
4770   predicate(Matcher::vector_length(n->in(2)) == 2); // src
4771   match(Set dst (AddReductionVD dst src));
4772   match(Set dst (MulReductionVD dst src));
4773   effect(TEMP dst, TEMP vtmp);
4774   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
4775   ins_encode %{
4776     int opcode = this->ideal_Opcode();
4777     int vlen = Matcher::vector_length(this, $src);
4778     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4779   %}
4780   ins_pipe( pipe_slow );
4781 %}
4782 
4783 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
4784   predicate(Matcher::vector_length(n->in(2)) == 4); // src
4785   match(Set dst (AddReductionVD dst src));
4786   match(Set dst (MulReductionVD dst src));
4787   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4788   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4789   ins_encode %{
4790     int opcode = this->ideal_Opcode();
4791     int vlen = Matcher::vector_length(this, $src);
4792     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4793   %}
4794   ins_pipe( pipe_slow );
4795 %}
4796 
4797 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4798   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4799   match(Set dst (AddReductionVD dst src));
4800   match(Set dst (MulReductionVD dst src));
4801   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4802   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4803   ins_encode %{
4804     int opcode = this->ideal_Opcode();
4805     int vlen = Matcher::vector_length(this, $src);
4806     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4807   %}
4808   ins_pipe( pipe_slow );
4809 %}
4810 
4811 // =======================Byte Reduction==========================================
4812 
4813 #ifdef _LP64
4814 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4815   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
4816   match(Set dst (AddReductionVI src1 src2));
4817   match(Set dst (AndReductionV  src1 src2));
4818   match(Set dst ( OrReductionV  src1 src2));
4819   match(Set dst (XorReductionV  src1 src2));
4820   match(Set dst (MinReductionV  src1 src2));
4821   match(Set dst (MaxReductionV  src1 src2));
4822   effect(TEMP vtmp1, TEMP vtmp2);
4823   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4824   ins_encode %{
4825     int opcode = this->ideal_Opcode();
4826     int vlen = Matcher::vector_length(this, $src2);
4827     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4828   %}
4829   ins_pipe( pipe_slow );
4830 %}
4831 
4832 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4833   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
4834   match(Set dst (AddReductionVI src1 src2));
4835   match(Set dst (AndReductionV  src1 src2));
4836   match(Set dst ( OrReductionV  src1 src2));
4837   match(Set dst (XorReductionV  src1 src2));
4838   match(Set dst (MinReductionV  src1 src2));
4839   match(Set dst (MaxReductionV  src1 src2));
4840   effect(TEMP vtmp1, TEMP vtmp2);
4841   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4842   ins_encode %{
4843     int opcode = this->ideal_Opcode();
4844     int vlen = Matcher::vector_length(this, $src2);
4845     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4846   %}
4847   ins_pipe( pipe_slow );
4848 %}
4849 #endif
4850 
4851 // =======================Short Reduction==========================================
4852 
4853 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4854   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
4855   match(Set dst (AddReductionVI src1 src2));
4856   match(Set dst (MulReductionVI src1 src2));
4857   match(Set dst (AndReductionV  src1 src2));
4858   match(Set dst ( OrReductionV  src1 src2));
4859   match(Set dst (XorReductionV  src1 src2));
4860   match(Set dst (MinReductionV  src1 src2));
4861   match(Set dst (MaxReductionV  src1 src2));
4862   effect(TEMP vtmp1, TEMP vtmp2);
4863   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4864   ins_encode %{
4865     int opcode = this->ideal_Opcode();
4866     int vlen = Matcher::vector_length(this, $src2);
4867     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4868   %}
4869   ins_pipe( pipe_slow );
4870 %}
4871 
4872 // =======================Mul Reduction==========================================
4873 
4874 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4875   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4876             Matcher::vector_length(n->in(2)) <= 32); // src2
4877   match(Set dst (MulReductionVI src1 src2));
4878   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4879   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4880   ins_encode %{
4881     int opcode = this->ideal_Opcode();
4882     int vlen = Matcher::vector_length(this, $src2);
4883     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4884   %}
4885   ins_pipe( pipe_slow );
4886 %}
4887 
4888 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4889   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4890             Matcher::vector_length(n->in(2)) == 64); // src2
4891   match(Set dst (MulReductionVI src1 src2));
4892   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4893   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4894   ins_encode %{
4895     int opcode = this->ideal_Opcode();
4896     int vlen = Matcher::vector_length(this, $src2);
4897     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4898   %}
4899   ins_pipe( pipe_slow );
4900 %}
4901 
4902 //--------------------Min/Max Float Reduction --------------------
4903 // Float Min Reduction
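     // The immF src1 forms only match when src1 is the identity value of the operation
     // (+Inf for min, -Inf for max, see the predicates), so the macro assembler reduces over
     // $src2 alone; the *_av forms below fold the previous value of $dst into the result.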
4904 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
4905                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4906   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4907             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4908              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4909             Matcher::vector_length(n->in(2)) == 2);
4910   match(Set dst (MinReductionV src1 src2));
4911   match(Set dst (MaxReductionV src1 src2));
4912   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4913   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4914   ins_encode %{
4915     assert(UseAVX > 0, "sanity");
4916 
4917     int opcode = this->ideal_Opcode();
4918     int vlen = Matcher::vector_length(this, $src2);
4919     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4920                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4921   %}
4922   ins_pipe( pipe_slow );
4923 %}
4924 
4925 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
4926                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4927   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4928             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4929              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4930             Matcher::vector_length(n->in(2)) >= 4);
4931   match(Set dst (MinReductionV src1 src2));
4932   match(Set dst (MaxReductionV src1 src2));
4933   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4934   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4935   ins_encode %{
4936     assert(UseAVX > 0, "sanity");
4937 
4938     int opcode = this->ideal_Opcode();
4939     int vlen = Matcher::vector_length(this, $src2);
4940     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4941                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4942   %}
4943   ins_pipe( pipe_slow );
4944 %}
4945 
4946 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
4947                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4948   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4949             Matcher::vector_length(n->in(2)) == 2);
4950   match(Set dst (MinReductionV dst src));
4951   match(Set dst (MaxReductionV dst src));
4952   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4953   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4954   ins_encode %{
4955     assert(UseAVX > 0, "sanity");
4956 
4957     int opcode = this->ideal_Opcode();
4958     int vlen = Matcher::vector_length(this, $src);
4959     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4960                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4961   %}
4962   ins_pipe( pipe_slow );
4963 %}
4964 
4965 
4966 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
4967                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4968   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4969             Matcher::vector_length(n->in(2)) >= 4);
4970   match(Set dst (MinReductionV dst src));
4971   match(Set dst (MaxReductionV dst src));
4972   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4973   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4974   ins_encode %{
4975     assert(UseAVX > 0, "sanity");
4976 
4977     int opcode = this->ideal_Opcode();
4978     int vlen = Matcher::vector_length(this, $src);
4979     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4980                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4981   %}
4982   ins_pipe( pipe_slow );
4983 %}
4984 
4985 
4986 //--------------------Min Double Reduction --------------------
4987 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
4988                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4989                             rFlagsReg cr) %{
4990   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4991             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4992              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4993             Matcher::vector_length(n->in(2)) == 2);
4994   match(Set dst (MinReductionV src1 src2));
4995   match(Set dst (MaxReductionV src1 src2));
4996   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4997   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4998   ins_encode %{
4999     assert(UseAVX > 0, "sanity");
5000 
5001     int opcode = this->ideal_Opcode();
5002     int vlen = Matcher::vector_length(this, $src2);
5003     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
5004                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
5005   %}
5006   ins_pipe( pipe_slow );
5007 %}
5008 
5009 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
5010                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
5011                            rFlagsReg cr) %{
5012   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5013             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
5014              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
5015             Matcher::vector_length(n->in(2)) >= 4);
5016   match(Set dst (MinReductionV src1 src2));
5017   match(Set dst (MaxReductionV src1 src2));
5018   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
5019   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
5020   ins_encode %{
5021     assert(UseAVX > 0, "sanity");
5022 
5023     int opcode = this->ideal_Opcode();
5024     int vlen = Matcher::vector_length(this, $src2);
5025     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
5026                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
5027   %}
5028   ins_pipe( pipe_slow );
5029 %}
5030 
5031 
5032 instruct minmax_reduction2D_av(legRegD dst, legVec src,
5033                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
5034                                rFlagsReg cr) %{
5035   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5036             Matcher::vector_length(n->in(2)) == 2);
5037   match(Set dst (MinReductionV dst src));
5038   match(Set dst (MaxReductionV dst src));
5039   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
5040   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
5041   ins_encode %{
5042     assert(UseAVX > 0, "sanity");
5043 
5044     int opcode = this->ideal_Opcode();
5045     int vlen = Matcher::vector_length(this, $src);
5046     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
5047                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
5048   %}
5049   ins_pipe( pipe_slow );
5050 %}
5051 
5052 instruct minmax_reductionD_av(legRegD dst, legVec src,
5053                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
5054                               rFlagsReg cr) %{
5055   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5056             Matcher::vector_length(n->in(2)) >= 4);
5057   match(Set dst (MinReductionV dst src));
5058   match(Set dst (MaxReductionV dst src));
5059   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
5060   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
5061   ins_encode %{
5062     assert(UseAVX > 0, "sanity");
5063 
5064     int opcode = this->ideal_Opcode();
5065     int vlen = Matcher::vector_length(this, $src);
5066     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
5067                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
5068   %}
5069   ins_pipe( pipe_slow );
5070 %}
5071 
5072 // ====================VECTOR ARITHMETIC=======================================
5073 
5074 // --------------------------------- ADD --------------------------------------
5075 
5076 // Bytes vector add
5077 instruct vaddB(vec dst, vec src) %{
5078   predicate(UseAVX == 0);
5079   match(Set dst (AddVB dst src));
5080   format %{ "paddb   $dst,$src\t! add packedB" %}
5081   ins_encode %{
5082     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5083   %}
5084   ins_pipe( pipe_slow );
5085 %}
5086 
5087 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
5088   predicate(UseAVX > 0);
5089   match(Set dst (AddVB src1 src2));
5090   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
5091   ins_encode %{
5092     int vlen_enc = vector_length_encoding(this);
5093     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5094   %}
5095   ins_pipe( pipe_slow );
5096 %}
5097 
5098 instruct vaddB_mem(vec dst, vec src, memory mem) %{
5099   predicate((UseAVX > 0) &&
5100             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5101   match(Set dst (AddVB src (LoadVector mem)));
5102   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
5103   ins_encode %{
5104     int vlen_enc = vector_length_encoding(this);
5105     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5106   %}
5107   ins_pipe( pipe_slow );
5108 %}
5109 
5110 // Shorts/Chars vector add
5111 instruct vaddS(vec dst, vec src) %{
5112   predicate(UseAVX == 0);
5113   match(Set dst (AddVS dst src));
5114   format %{ "paddw   $dst,$src\t! add packedS" %}
5115   ins_encode %{
5116     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5117   %}
5118   ins_pipe( pipe_slow );
5119 %}
5120 
5121 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
5122   predicate(UseAVX > 0);
5123   match(Set dst (AddVS src1 src2));
5124   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
5125   ins_encode %{
5126     int vlen_enc = vector_length_encoding(this);
5127     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5128   %}
5129   ins_pipe( pipe_slow );
5130 %}
5131 
5132 instruct vaddS_mem(vec dst, vec src, memory mem) %{
5133   predicate((UseAVX > 0) &&
5134             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5135   match(Set dst (AddVS src (LoadVector mem)));
5136   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
5137   ins_encode %{
5138     int vlen_enc = vector_length_encoding(this);
5139     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5140   %}
5141   ins_pipe( pipe_slow );
5142 %}
5143 
5144 // Integers vector add
5145 instruct vaddI(vec dst, vec src) %{
5146   predicate(UseAVX == 0);
5147   match(Set dst (AddVI dst src));
5148   format %{ "paddd   $dst,$src\t! add packedI" %}
5149   ins_encode %{
5150     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
5151   %}
5152   ins_pipe( pipe_slow );
5153 %}
5154 
5155 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
5156   predicate(UseAVX > 0);
5157   match(Set dst (AddVI src1 src2));
5158   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
5159   ins_encode %{
5160     int vlen_enc = vector_length_encoding(this);
5161     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5162   %}
5163   ins_pipe( pipe_slow );
5164 %}
5165 
5166 
5167 instruct vaddI_mem(vec dst, vec src, memory mem) %{
5168   predicate((UseAVX > 0) &&
5169             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5170   match(Set dst (AddVI src (LoadVector mem)));
5171   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
5172   ins_encode %{
5173     int vlen_enc = vector_length_encoding(this);
5174     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5175   %}
5176   ins_pipe( pipe_slow );
5177 %}
5178 
5179 // Longs vector add
5180 instruct vaddL(vec dst, vec src) %{
5181   predicate(UseAVX == 0);
5182   match(Set dst (AddVL dst src));
5183   format %{ "paddq   $dst,$src\t! add packedL" %}
5184   ins_encode %{
5185     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
5186   %}
5187   ins_pipe( pipe_slow );
5188 %}
5189 
5190 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
5191   predicate(UseAVX > 0);
5192   match(Set dst (AddVL src1 src2));
5193   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
5194   ins_encode %{
5195     int vlen_enc = vector_length_encoding(this);
5196     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5197   %}
5198   ins_pipe( pipe_slow );
5199 %}
5200 
5201 instruct vaddL_mem(vec dst, vec src, memory mem) %{
5202   predicate((UseAVX > 0) &&
5203             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5204   match(Set dst (AddVL src (LoadVector mem)));
5205   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
5206   ins_encode %{
5207     int vlen_enc = vector_length_encoding(this);
5208     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5209   %}
5210   ins_pipe( pipe_slow );
5211 %}
5212 
5213 // Floats vector add
5214 instruct vaddF(vec dst, vec src) %{
5215   predicate(UseAVX == 0);
5216   match(Set dst (AddVF dst src));
5217   format %{ "addps   $dst,$src\t! add packedF" %}
5218   ins_encode %{
5219     __ addps($dst$$XMMRegister, $src$$XMMRegister);
5220   %}
5221   ins_pipe( pipe_slow );
5222 %}
5223 
5224 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
5225   predicate(UseAVX > 0);
5226   match(Set dst (AddVF src1 src2));
5227   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
5228   ins_encode %{
5229     int vlen_enc = vector_length_encoding(this);
5230     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5231   %}
5232   ins_pipe( pipe_slow );
5233 %}
5234 
5235 instruct vaddF_mem(vec dst, vec src, memory mem) %{
5236   predicate((UseAVX > 0) &&
5237             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5238   match(Set dst (AddVF src (LoadVector mem)));
5239   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
5240   ins_encode %{
5241     int vlen_enc = vector_length_encoding(this);
5242     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5243   %}
5244   ins_pipe( pipe_slow );
5245 %}
5246 
5247 // Doubles vector add
5248 instruct vaddD(vec dst, vec src) %{
5249   predicate(UseAVX == 0);
5250   match(Set dst (AddVD dst src));
5251   format %{ "addpd   $dst,$src\t! add packedD" %}
5252   ins_encode %{
5253     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
5254   %}
5255   ins_pipe( pipe_slow );
5256 %}
5257 
5258 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
5259   predicate(UseAVX > 0);
5260   match(Set dst (AddVD src1 src2));
5261   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
5262   ins_encode %{
5263     int vlen_enc = vector_length_encoding(this);
5264     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5265   %}
5266   ins_pipe( pipe_slow );
5267 %}
5268 
5269 instruct vaddD_mem(vec dst, vec src, memory mem) %{
5270   predicate((UseAVX > 0) &&
5271             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5272   match(Set dst (AddVD src (LoadVector mem)));
5273   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
5274   ins_encode %{
5275     int vlen_enc = vector_length_encoding(this);
5276     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5277   %}
5278   ins_pipe( pipe_slow );
5279 %}
5280 
5281 // --------------------------------- SUB --------------------------------------
5282 
5283 // Bytes vector sub
5284 instruct vsubB(vec dst, vec src) %{
5285   predicate(UseAVX == 0);
5286   match(Set dst (SubVB dst src));
5287   format %{ "psubb   $dst,$src\t! sub packedB" %}
5288   ins_encode %{
5289     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5290   %}
5291   ins_pipe( pipe_slow );
5292 %}
5293 
5294 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
5295   predicate(UseAVX > 0);
5296   match(Set dst (SubVB src1 src2));
5297   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
5298   ins_encode %{
5299     int vlen_enc = vector_length_encoding(this);
5300     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5301   %}
5302   ins_pipe( pipe_slow );
5303 %}
5304 
5305 instruct vsubB_mem(vec dst, vec src, memory mem) %{
5306   predicate((UseAVX > 0) &&
5307             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5308   match(Set dst (SubVB src (LoadVector mem)));
5309   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
5310   ins_encode %{
5311     int vlen_enc = vector_length_encoding(this);
5312     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5313   %}
5314   ins_pipe( pipe_slow );
5315 %}
5316 
5317 // Shorts/Chars vector sub
5318 instruct vsubS(vec dst, vec src) %{
5319   predicate(UseAVX == 0);
5320   match(Set dst (SubVS dst src));
5321   format %{ "psubw   $dst,$src\t! sub packedS" %}
5322   ins_encode %{
5323     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5324   %}
5325   ins_pipe( pipe_slow );
5326 %}
5327 
5328 
5329 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
5330   predicate(UseAVX > 0);
5331   match(Set dst (SubVS src1 src2));
5332   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
5333   ins_encode %{
5334     int vlen_enc = vector_length_encoding(this);
5335     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5336   %}
5337   ins_pipe( pipe_slow );
5338 %}
5339 
5340 instruct vsubS_mem(vec dst, vec src, memory mem) %{
5341   predicate((UseAVX > 0) &&
5342             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5343   match(Set dst (SubVS src (LoadVector mem)));
5344   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
5345   ins_encode %{
5346     int vlen_enc = vector_length_encoding(this);
5347     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5348   %}
5349   ins_pipe( pipe_slow );
5350 %}
5351 
5352 // Integers vector sub
5353 instruct vsubI(vec dst, vec src) %{
5354   predicate(UseAVX == 0);
5355   match(Set dst (SubVI dst src));
5356   format %{ "psubd   $dst,$src\t! sub packedI" %}
5357   ins_encode %{
5358     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
5359   %}
5360   ins_pipe( pipe_slow );
5361 %}
5362 
5363 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
5364   predicate(UseAVX > 0);
5365   match(Set dst (SubVI src1 src2));
5366   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
5367   ins_encode %{
5368     int vlen_enc = vector_length_encoding(this);
5369     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5370   %}
5371   ins_pipe( pipe_slow );
5372 %}
5373 
5374 instruct vsubI_mem(vec dst, vec src, memory mem) %{
5375   predicate((UseAVX > 0) &&
5376             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5377   match(Set dst (SubVI src (LoadVector mem)));
5378   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
5379   ins_encode %{
5380     int vlen_enc = vector_length_encoding(this);
5381     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5382   %}
5383   ins_pipe( pipe_slow );
5384 %}
5385 
5386 // Longs vector sub
5387 instruct vsubL(vec dst, vec src) %{
5388   predicate(UseAVX == 0);
5389   match(Set dst (SubVL dst src));
5390   format %{ "psubq   $dst,$src\t! sub packedL" %}
5391   ins_encode %{
5392     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
5393   %}
5394   ins_pipe( pipe_slow );
5395 %}
5396 
5397 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
5398   predicate(UseAVX > 0);
5399   match(Set dst (SubVL src1 src2));
5400   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
5401   ins_encode %{
5402     int vlen_enc = vector_length_encoding(this);
5403     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5404   %}
5405   ins_pipe( pipe_slow );
5406 %}
5407 
5408 
5409 instruct vsubL_mem(vec dst, vec src, memory mem) %{
5410   predicate((UseAVX > 0) &&
5411             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5412   match(Set dst (SubVL src (LoadVector mem)));
5413   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
5414   ins_encode %{
5415     int vlen_enc = vector_length_encoding(this);
5416     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5417   %}
5418   ins_pipe( pipe_slow );
5419 %}
5420 
5421 // Floats vector sub
5422 instruct vsubF(vec dst, vec src) %{
5423   predicate(UseAVX == 0);
5424   match(Set dst (SubVF dst src));
5425   format %{ "subps   $dst,$src\t! sub packedF" %}
5426   ins_encode %{
5427     __ subps($dst$$XMMRegister, $src$$XMMRegister);
5428   %}
5429   ins_pipe( pipe_slow );
5430 %}
5431 
5432 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
5433   predicate(UseAVX > 0);
5434   match(Set dst (SubVF src1 src2));
5435   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
5436   ins_encode %{
5437     int vlen_enc = vector_length_encoding(this);
5438     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5439   %}
5440   ins_pipe( pipe_slow );
5441 %}
5442 
5443 instruct vsubF_mem(vec dst, vec src, memory mem) %{
5444   predicate((UseAVX > 0) &&
5445             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5446   match(Set dst (SubVF src (LoadVector mem)));
5447   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
5448   ins_encode %{
5449     int vlen_enc = vector_length_encoding(this);
5450     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5451   %}
5452   ins_pipe( pipe_slow );
5453 %}
5454 
5455 // Doubles vector sub
5456 instruct vsubD(vec dst, vec src) %{
5457   predicate(UseAVX == 0);
5458   match(Set dst (SubVD dst src));
5459   format %{ "subpd   $dst,$src\t! sub packedD" %}
5460   ins_encode %{
5461     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
5462   %}
5463   ins_pipe( pipe_slow );
5464 %}
5465 
5466 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
5467   predicate(UseAVX > 0);
5468   match(Set dst (SubVD src1 src2));
5469   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
5470   ins_encode %{
5471     int vlen_enc = vector_length_encoding(this);
5472     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5473   %}
5474   ins_pipe( pipe_slow );
5475 %}
5476 
5477 instruct vsubD_mem(vec dst, vec src, memory mem) %{
5478   predicate((UseAVX > 0) &&
5479             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5480   match(Set dst (SubVD src (LoadVector mem)));
5481   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
5482   ins_encode %{
5483     int vlen_enc = vector_length_encoding(this);
5484     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5485   %}
5486   ins_pipe( pipe_slow );
5487 %}
5488 
5489 // --------------------------------- MUL --------------------------------------
5490 
5491 // Byte vector mul
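     // x86 has no packed 8-bit multiply, so byte vectors are handled by
     // sign-extending the elements to 16-bit words (pmovsxbw/vpmovsxbw),
     // multiplying with the word multiply (pmullw/vpmullw), masking each
     // result to its low byte (vector_short_to_byte_mask) and packing the
     // bytes back together (packuswb/vpackuswb).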
5492 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5493   predicate(Matcher::vector_length(n) == 4 ||
5494             Matcher::vector_length(n) == 8);
5495   match(Set dst (MulVB src1 src2));
5496   effect(TEMP dst, TEMP tmp, TEMP scratch);
5497   format %{ "vector_mulB $dst,$src1,$src2" %}
5498   ins_encode %{
5499     assert(UseSSE > 3, "required");
5500     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
5501     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
5502     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
5503     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5504     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5505     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5506   %}
5507   ins_pipe( pipe_slow );
5508 %}
5509 
5510 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5511   predicate(Matcher::vector_length(n) == 16 && UseAVX <= 1);
5512   match(Set dst (MulVB src1 src2));
5513   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5514   format %{ "vector_mulB $dst,$src1,$src2" %}
5515   ins_encode %{
5516     assert(UseSSE > 3, "required");
5517     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
5518     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
5519     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
5520     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
5521     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
5522     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5523     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
5524     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
5525     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5526     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5527     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5528     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5529   %}
5530   ins_pipe( pipe_slow );
5531 %}
5532 
5533 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5534   predicate(Matcher::vector_length(n) == 16 && UseAVX > 1);
5535   match(Set dst (MulVB src1 src2));
5536   effect(TEMP dst, TEMP tmp, TEMP scratch);
5537   format %{ "vector_mulB $dst,$src1,$src2" %}
5538   ins_encode %{
5539     int vlen_enc = Assembler::AVX_256bit;
5540     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5541     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5542     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5543     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5544     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5545     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
5546     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
5547   %}
5548   ins_pipe( pipe_slow );
5549 %}
5550 
5551 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5552   predicate(Matcher::vector_length(n) == 32);
5553   match(Set dst (MulVB src1 src2));
5554   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5555   format %{ "vector_mulB $dst,$src1,$src2" %}
5556   ins_encode %{
5557     assert(UseAVX > 1, "required");
5558     int vlen_enc = Assembler::AVX_256bit;
5559     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5560     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
5561     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5562     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5563     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5564     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5565     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5566     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5567     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5568     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5569     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5570     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5571     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5572     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5573   %}
5574   ins_pipe( pipe_slow );
5575 %}
5576 
5577 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5578   predicate(Matcher::vector_length(n) == 64);
5579   match(Set dst (MulVB src1 src2));
5580   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5581   format %{ "vector_mulB $dst,$src1,$src2" %}
5582   ins_encode %{
5583     assert(UseAVX > 2, "required");
5584     int vlen_enc = Assembler::AVX_512bit;
5585     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5586     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
5587     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5588     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5589     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5590     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5591     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5592     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5593     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5594     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5595     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5596     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5597     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5598     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
5599     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5600   %}
5601   ins_pipe( pipe_slow );
5602 %}
5603 
5604 // Shorts/Chars vector mul
5605 instruct vmulS(vec dst, vec src) %{
5606   predicate(UseAVX == 0);
5607   match(Set dst (MulVS dst src));
5608   format %{ "pmullw $dst,$src\t! mul packedS" %}
5609   ins_encode %{
5610     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
5611   %}
5612   ins_pipe( pipe_slow );
5613 %}
5614 
5615 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
5616   predicate(UseAVX > 0);
5617   match(Set dst (MulVS src1 src2));
5618   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
5619   ins_encode %{
5620     int vlen_enc = vector_length_encoding(this);
5621     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5622   %}
5623   ins_pipe( pipe_slow );
5624 %}
5625 
5626 instruct vmulS_mem(vec dst, vec src, memory mem) %{
5627   predicate((UseAVX > 0) &&
5628             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5629   match(Set dst (MulVS src (LoadVector mem)));
5630   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
5631   ins_encode %{
5632     int vlen_enc = vector_length_encoding(this);
5633     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5634   %}
5635   ins_pipe( pipe_slow );
5636 %}
5637 
5638 // Integers vector mul
5639 instruct vmulI(vec dst, vec src) %{
5640   predicate(UseAVX == 0);
5641   match(Set dst (MulVI dst src));
5642   format %{ "pmulld  $dst,$src\t! mul packedI" %}
5643   ins_encode %{
5644     assert(UseSSE > 3, "required");
5645     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
5646   %}
5647   ins_pipe( pipe_slow );
5648 %}
5649 
5650 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
5651   predicate(UseAVX > 0);
5652   match(Set dst (MulVI src1 src2));
5653   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
5654   ins_encode %{
5655     int vlen_enc = vector_length_encoding(this);
5656     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5657   %}
5658   ins_pipe( pipe_slow );
5659 %}
5660 
5661 instruct vmulI_mem(vec dst, vec src, memory mem) %{
5662   predicate((UseAVX > 0) &&
5663             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5664   match(Set dst (MulVI src (LoadVector mem)));
5665   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
5666   ins_encode %{
5667     int vlen_enc = vector_length_encoding(this);
5668     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5669   %}
5670   ins_pipe( pipe_slow );
5671 %}
5672 
5673 // Longs vector mul
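     // A packed 64-bit multiply (vpmullq) is only available with AVX-512DQ.
     // Without it, mul2L_reg and vmul4L_reg_avx below synthesize the product
     // from 32-bit partial products.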
5674 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
5675   predicate(VM_Version::supports_avx512dq());
5676   match(Set dst (MulVL src1 src2));
5677   format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
5678   ins_encode %{
5679     assert(UseAVX > 2, "required");
5680     int vlen_enc = vector_length_encoding(this);
5681     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5682   %}
5683   ins_pipe( pipe_slow );
5684 %}
5685 
5686 instruct vmulL_mem(vec dst, vec src, memory mem) %{
5687   predicate(VM_Version::supports_avx512dq() &&
5688               (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5689   match(Set dst (MulVL src (LoadVector mem)));
5690   format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
5691   ins_encode %{
5692     assert(UseAVX > 2, "required");
5693     int vlen_enc = vector_length_encoding(this);
5694     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5695   %}
5696   ins_pipe( pipe_slow );
5697 %}
5698 
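     // Without AVX-512DQ the low 64 bits of each lane's product are assembled
     // from 32-bit halves:  a*b mod 2^64 = aL*bL + ((aL*bH + aH*bL) << 32),
     // with a = aH:aL and b = bH:bL.  pshufd/pmulld/phaddd compute and sum the
     // two cross products, pmovzxdq/psllq move that sum into the high 32 bits
     // of each lane, pmuludq forms the unsigned aL*bL term, and paddq combines.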
5699 instruct mul2L_reg(vec dst, vec src2, legVec tmp) %{
5700   predicate(Matcher::vector_length(n) == 2 && !VM_Version::supports_avx512dq());
5701   match(Set dst (MulVL dst src2));
5702   effect(TEMP dst, TEMP tmp);
5703   format %{ "pshufd $tmp,$src2, 177\n\t"
5704             "pmulld $tmp,$dst\n\t"
5705             "phaddd $tmp,$tmp\n\t"
5706             "pmovzxdq $tmp,$tmp\n\t"
5707             "psllq $tmp, 32\n\t"
5708             "pmuludq $dst,$src2\n\t"
5709             "paddq $dst,$tmp\n\t! mul packed2L" %}
5710 
5711   ins_encode %{
5712     assert(VM_Version::supports_sse4_1(), "required");
5714     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
5715     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
5716     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5717     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
5718     __ psllq($tmp$$XMMRegister, 32);
5719     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
5720     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
5721   %}
5722   ins_pipe( pipe_slow );
5723 %}
5724 
5725 instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, legVec tmp, legVec tmp1) %{
5726   predicate(Matcher::vector_length(n) == 4 && !VM_Version::supports_avx512dq());
5727   match(Set dst (MulVL src1 src2));
5728   effect(TEMP tmp1, TEMP tmp);
5729   format %{ "vpshufd $tmp,$src2\n\t"
5730             "vpmulld $tmp,$src1,$tmp\n\t"
5731             "vphaddd $tmp,$tmp,$tmp\n\t"
5732             "vpmovzxdq $tmp,$tmp\n\t"
5733             "vpsllq $tmp,$tmp\n\t"
5734             "vpmuludq $tmp1,$src1,$src2\n\t"
5735             "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
5736   ins_encode %{
5737     int vlen_enc = Assembler::AVX_256bit;
5738     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
5739     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5740     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
5741     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5742     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5743     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
5744     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5745     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5746   %}
5747   ins_pipe( pipe_slow );
5748 %}
5749 
5750 // Floats vector mul
5751 instruct vmulF(vec dst, vec src) %{
5752   predicate(UseAVX == 0);
5753   match(Set dst (MulVF dst src));
5754   format %{ "mulps   $dst,$src\t! mul packedF" %}
5755   ins_encode %{
5756     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
5757   %}
5758   ins_pipe( pipe_slow );
5759 %}
5760 
5761 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
5762   predicate(UseAVX > 0);
5763   match(Set dst (MulVF src1 src2));
5764   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
5765   ins_encode %{
5766     int vlen_enc = vector_length_encoding(this);
5767     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5768   %}
5769   ins_pipe( pipe_slow );
5770 %}
5771 
5772 instruct vmulF_mem(vec dst, vec src, memory mem) %{
5773   predicate((UseAVX > 0) &&
5774             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5775   match(Set dst (MulVF src (LoadVector mem)));
5776   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
5777   ins_encode %{
5778     int vlen_enc = vector_length_encoding(this);
5779     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5780   %}
5781   ins_pipe( pipe_slow );
5782 %}
5783 
5784 // Doubles vector mul
5785 instruct vmulD(vec dst, vec src) %{
5786   predicate(UseAVX == 0);
5787   match(Set dst (MulVD dst src));
5788   format %{ "mulpd   $dst,$src\t! mul packedD" %}
5789   ins_encode %{
5790     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
5791   %}
5792   ins_pipe( pipe_slow );
5793 %}
5794 
5795 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
5796   predicate(UseAVX > 0);
5797   match(Set dst (MulVD src1 src2));
5798   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
5799   ins_encode %{
5800     int vlen_enc = vector_length_encoding(this);
5801     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5802   %}
5803   ins_pipe( pipe_slow );
5804 %}
5805 
5806 instruct vmulD_mem(vec dst, vec src, memory mem) %{
5807   predicate((UseAVX > 0) &&
5808             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5809   match(Set dst (MulVD src (LoadVector mem)));
5810   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
5811   ins_encode %{
5812     int vlen_enc = vector_length_encoding(this);
5813     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5814   %}
5815   ins_pipe( pipe_slow );
5816 %}
5817 
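     // Vector conditional move: the compare (vcmpps/vcmppd) leaves an all-ones
     // or all-zeros mask in each lane, and the blend (vblendvps/vblendvpd)
     // then picks the corresponding lane from $src1 or $src2 based on that mask.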
5818 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5819   predicate(Matcher::vector_length(n) == 8);
5820   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
5821   effect(TEMP dst, USE src1, USE src2);
5822   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
5823             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
5824          %}
5825   ins_encode %{
5826     assert(UseAVX > 0, "required");
5827 
5828     int vlen_enc = Assembler::AVX_256bit;
5829     int cond = (Assembler::Condition)($copnd$$cmpcode);
5830     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5831     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5832   %}
5833   ins_pipe( pipe_slow );
5834 %}
5835 
5836 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5837   predicate(Matcher::vector_length(n) == 4);
5838   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
5839   effect(TEMP dst, USE src1, USE src2);
5840   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
5841             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
5842          %}
5843   ins_encode %{
5844     assert(UseAVX > 0, "required");
5845 
5846     int vlen_enc = Assembler::AVX_256bit;
5847     int cond = (Assembler::Condition)($copnd$$cmpcode);
5848     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5849     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5850   %}
5851   ins_pipe( pipe_slow );
5852 %}
5853 
5854 // --------------------------------- DIV --------------------------------------
5855 
5856 // Floats vector div
5857 instruct vdivF(vec dst, vec src) %{
5858   predicate(UseAVX == 0);
5859   match(Set dst (DivVF dst src));
5860   format %{ "divps   $dst,$src\t! div packedF" %}
5861   ins_encode %{
5862     __ divps($dst$$XMMRegister, $src$$XMMRegister);
5863   %}
5864   ins_pipe( pipe_slow );
5865 %}
5866 
5867 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
5868   predicate(UseAVX > 0);
5869   match(Set dst (DivVF src1 src2));
5870   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
5871   ins_encode %{
5872     int vlen_enc = vector_length_encoding(this);
5873     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5874   %}
5875   ins_pipe( pipe_slow );
5876 %}
5877 
5878 instruct vdivF_mem(vec dst, vec src, memory mem) %{
5879   predicate((UseAVX > 0) &&
5880             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5881   match(Set dst (DivVF src (LoadVector mem)));
5882   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
5883   ins_encode %{
5884     int vlen_enc = vector_length_encoding(this);
5885     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5886   %}
5887   ins_pipe( pipe_slow );
5888 %}
5889 
5890 // Doubles vector div
5891 instruct vdivD(vec dst, vec src) %{
5892   predicate(UseAVX == 0);
5893   match(Set dst (DivVD dst src));
5894   format %{ "divpd   $dst,$src\t! div packedD" %}
5895   ins_encode %{
5896     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
5897   %}
5898   ins_pipe( pipe_slow );
5899 %}
5900 
5901 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
5902   predicate(UseAVX > 0);
5903   match(Set dst (DivVD src1 src2));
5904   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
5905   ins_encode %{
5906     int vlen_enc = vector_length_encoding(this);
5907     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5908   %}
5909   ins_pipe( pipe_slow );
5910 %}
5911 
5912 instruct vdivD_mem(vec dst, vec src, memory mem) %{
5913   predicate((UseAVX > 0) &&
5914             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5915   match(Set dst (DivVD src (LoadVector mem)));
5916   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
5917   ins_encode %{
5918     int vlen_enc = vector_length_encoding(this);
5919     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5920   %}
5921   ins_pipe( pipe_slow );
5922 %}
5923 
5924 // ------------------------------ MinMax ---------------------------------------
5925 
5926 // Byte, Short, Int vector Min/Max
5927 instruct minmax_reg_sse(vec dst, vec src) %{
5928   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5929             UseAVX == 0);
5930   match(Set dst (MinV dst src));
5931   match(Set dst (MaxV dst src));
5932   format %{ "vector_minmax  $dst,$src\t!  " %}
5933   ins_encode %{
5934     assert(UseSSE >= 4, "required");
5935 
5936     int opcode = this->ideal_Opcode();
5937     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5938     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
5939   %}
5940   ins_pipe( pipe_slow );
5941 %}
5942 
5943 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
5944   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5945             UseAVX > 0);
5946   match(Set dst (MinV src1 src2));
5947   match(Set dst (MaxV src1 src2));
5948   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
5949   ins_encode %{
5950     int opcode = this->ideal_Opcode();
5951     int vlen_enc = vector_length_encoding(this);
5952     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5953 
5954     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5955   %}
5956   ins_pipe( pipe_slow );
5957 %}
5958 
5959 // Long vector Min/Max
5960 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
5961   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
5962             UseAVX == 0);
5963   match(Set dst (MinV dst src));
5964   match(Set dst (MaxV src dst));
5965   effect(TEMP dst, TEMP tmp);
5966   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
5967   ins_encode %{
5968     assert(UseSSE >= 4, "required");
5969 
5970     int opcode = this->ideal_Opcode();
5971     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5972     assert(elem_bt == T_LONG, "sanity");
5973 
5974     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
5975   %}
5976   ins_pipe( pipe_slow );
5977 %}
5978 
5979 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
5980   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
5981             UseAVX > 0 && !VM_Version::supports_avx512vl());
5982   match(Set dst (MinV src1 src2));
5983   match(Set dst (MaxV src1 src2));
5984   effect(TEMP dst);
5985   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
5986   ins_encode %{
5987     int vlen_enc = vector_length_encoding(this);
5988     int opcode = this->ideal_Opcode();
5989     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5990     assert(elem_bt == T_LONG, "sanity");
5991 
5992     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5993   %}
5994   ins_pipe( pipe_slow );
5995 %}
5996 
5997 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
5998   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
5999             Matcher::vector_element_basic_type(n) == T_LONG);
6000   match(Set dst (MinV src1 src2));
6001   match(Set dst (MaxV src1 src2));
6002   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
6003   ins_encode %{
6004     assert(UseAVX > 2, "required");
6005 
6006     int vlen_enc = vector_length_encoding(this);
6007     int opcode = this->ideal_Opcode();
6008     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6009     assert(elem_bt == T_LONG, "sanity");
6010 
6011     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6012   %}
6013   ins_pipe( pipe_slow );
6014 %}
6015 
6016 // Float/Double vector Min/Max
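     // Plain minps/maxps do not match Java semantics: if either input is NaN
     // they return the second operand, and min(-0.0, +0.0) compares equal and
     // also returns the second operand.  The vminmax_fp/evminmax_fp helpers
     // below therefore need the extra TEMP registers to propagate NaN and to
     // order -0.0 below +0.0 as Math.min/max require.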
6017 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
6018   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
6019             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
6020             UseAVX > 0);
6021   match(Set dst (MinV a b));
6022   match(Set dst (MaxV a b));
6023   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
6024   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
6025   ins_encode %{
6026     assert(UseAVX > 0, "required");
6027 
6028     int opcode = this->ideal_Opcode();
6029     int vlen_enc = vector_length_encoding(this);
6030     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6031 
6032     __ vminmax_fp(opcode, elem_bt,
6033                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
6034                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
6035   %}
6036   ins_pipe( pipe_slow );
6037 %}
6038 
6039 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
6040   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
6041             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
6042   match(Set dst (MinV a b));
6043   match(Set dst (MaxV a b));
6044   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
6045   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
6046   ins_encode %{
6047     assert(UseAVX > 2, "required");
6048 
6049     int opcode = this->ideal_Opcode();
6050     int vlen_enc = vector_length_encoding(this);
6051     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6052 
6053     __ evminmax_fp(opcode, elem_bt,
6054                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
6055                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
6056   %}
6057   ins_pipe( pipe_slow );
6058 %}
6059 
6060 // --------------------------------- Signum/CopySign ---------------------------
6061 
6062 instruct signumF_reg(regF dst, regF zero, regF one, rRegP scratch, rFlagsReg cr) %{
6063   match(Set dst (SignumF dst (Binary zero one)));
6064   effect(TEMP scratch, KILL cr);
6065   format %{ "signumF $dst, $dst\t! using $scratch as TEMP" %}
6066   ins_encode %{
6067     int opcode = this->ideal_Opcode();
6068     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
6069   %}
6070   ins_pipe( pipe_slow );
6071 %}
6072 
6073 instruct signumD_reg(regD dst, regD zero, regD one, rRegP scratch, rFlagsReg cr) %{
6074   match(Set dst (SignumD dst (Binary zero one)));
6075   effect(TEMP scratch, KILL cr);
6076   format %{ "signumD $dst, $dst\t! using $scratch as TEMP" %}
6077   ins_encode %{
6078     int opcode = this->ideal_Opcode();
6079     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
6080   %}
6081   ins_pipe( pipe_slow );
6082 %}
6083 
6084 // ---------------------------------------
6085 // For copySign use 0xE4 as the vpternlog truth-table immediate (imm8)
6086 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
6087 // C (xmm2) is set to 0x7FFFFFFF, i.e. every bit except the sign bit
6088 // Wherever xmm2 is 0 (the sign bit), we want to pick from B (the sign operand)
6089 // Wherever xmm2 is 1, we want to pick from A (the magnitude operand, dst)
6090 //
6091 // A B C Result
6092 // 0 0 0 0
6093 // 0 0 1 0
6094 // 0 1 0 1
6095 // 0 1 1 0
6096 // 1 0 0 0
6097 // 1 0 1 1
6098 // 1 1 0 1
6099 // 1 1 1 1
6100 //
6101 // Result going from high bit to low bit is 0b11100100 = 0xE4
6102 // ---------------------------------------
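     // With imm8 0xE4, vpternlog thus computes, bit by bit, result = C ? A : B.
     // Example (float): copySign(1.5f, -0.0f):
     //   A (dst, magnitude) = 0x3FC00000, B (src, sign) = 0x80000000,
     //   C = 0x7FFFFFFF  =>  result = 0xBFC00000 = -1.5f.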
6103 
6104 #ifdef _LP64
6105 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
6106   match(Set dst (CopySignF dst src));
6107   effect(TEMP tmp1, TEMP tmp2);
6108   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
6109   ins_encode %{
6110     __ movl($tmp2$$Register, 0x7FFFFFFF);
6111     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
6112     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
6113   %}
6114   ins_pipe( pipe_slow );
6115 %}
6116 
6117 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
6118   match(Set dst (CopySignD dst (Binary src zero)));
6119   ins_cost(100);
6120   effect(TEMP tmp1, TEMP tmp2);
6121   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
6122   ins_encode %{
6123     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
6124     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
6125     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
6126   %}
6127   ins_pipe( pipe_slow );
6128 %}
6129 #endif // _LP64
6130 
6131 // --------------------------------- Sqrt --------------------------------------
6132 
6133 instruct vsqrtF_reg(vec dst, vec src) %{
6134   match(Set dst (SqrtVF src));
6135   ins_cost(400);
6136   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
6137   ins_encode %{
6138     assert(UseAVX > 0, "required");
6139     int vlen_enc = vector_length_encoding(this);
6140     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6141   %}
6142   ins_pipe( pipe_slow );
6143 %}
6144 
6145 instruct vsqrtF_mem(vec dst, memory mem) %{
6146   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
6147   match(Set dst (SqrtVF (LoadVector mem)));
6148   ins_cost(400);
6149   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
6150   ins_encode %{
6151     assert(UseAVX > 0, "required");
6152     int vlen_enc = vector_length_encoding(this);
6153     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
6154   %}
6155   ins_pipe( pipe_slow );
6156 %}
6157 
6158 // Floating point vector sqrt
6159 instruct vsqrtD_reg(vec dst, vec src) %{
6160   match(Set dst (SqrtVD src));
6161   ins_cost(400);
6162   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
6163   ins_encode %{
6164     assert(UseAVX > 0, "required");
6165     int vlen_enc = vector_length_encoding(this);
6166     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6167   %}
6168   ins_pipe( pipe_slow );
6169 %}
6170 
6171 instruct vsqrtD_mem(vec dst, memory mem) %{
6172   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
6173   match(Set dst (SqrtVD (LoadVector mem)));
6174   ins_cost(400);
6175   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
6176   ins_encode %{
6177     assert(UseAVX > 0, "required");
6178     int vlen_enc = vector_length_encoding(this);
6179     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
6180   %}
6181   ins_pipe( pipe_slow );
6182 %}
6183 
6184 // ------------------------------ Shift ---------------------------------------
6185 
6186 // Left and right shift count vectors are the same on x86
6187 // (only the lowest bits of the xmm register are used as the count).
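     // The packed shift instructions take a variable count only from an xmm
     // register (or as an immediate), never from a general-purpose register,
     // hence the movdl transfer below.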
6188 instruct vshiftcnt(vec dst, rRegI cnt) %{
6189   match(Set dst (LShiftCntV cnt));
6190   match(Set dst (RShiftCntV cnt));
6191   format %{ "movdl    $dst,$cnt\t! load shift count" %}
6192   ins_encode %{
6193     __ movdl($dst$$XMMRegister, $cnt$$Register);
6194   %}
6195   ins_pipe( pipe_slow );
6196 %}
6197 
6198 // Byte vector shift
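     // As with byte multiply, x86 has no packed 8-bit shift: the bytes are
     // widened to 16-bit words (vextendbw), shifted with the word shift,
     // masked back to their low bytes and re-packed (packuswb/vpackuswb).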
6199 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6200   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
6201   match(Set dst ( LShiftVB src shift));
6202   match(Set dst ( RShiftVB src shift));
6203   match(Set dst (URShiftVB src shift));
6204   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
6205   format %{ "vector_byte_shift $dst,$src,$shift" %}
6206   ins_encode %{
6207     assert(UseSSE > 3, "required");
6208     int opcode = this->ideal_Opcode();
6209     bool sign = (opcode != Op_URShiftVB);
6210     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
6211     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
6212     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6213     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
6214     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
6215   %}
6216   ins_pipe( pipe_slow );
6217 %}
6218 
6219 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6220   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
6221             UseAVX <= 1);
6222   match(Set dst ( LShiftVB src shift));
6223   match(Set dst ( RShiftVB src shift));
6224   match(Set dst (URShiftVB src shift));
6225   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
6226   format %{ "vector_byte_shift $dst,$src,$shift" %}
6227   ins_encode %{
6228     assert(UseSSE > 3, "required");
6229     int opcode = this->ideal_Opcode();
6230     bool sign = (opcode != Op_URShiftVB);
6231     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
6232     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
6233     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
6234     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
6235     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
6236     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6237     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
6238     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
6239     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
6240   %}
6241   ins_pipe( pipe_slow );
6242 %}
6243 
6244 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6245   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
6246             UseAVX > 1);
6247   match(Set dst ( LShiftVB src shift));
6248   match(Set dst ( RShiftVB src shift));
6249   match(Set dst (URShiftVB src shift));
6250   effect(TEMP dst, TEMP tmp, TEMP scratch);
6251   format %{ "vector_byte_shift $dst,$src,$shift" %}
6252   ins_encode %{
6253     int opcode = this->ideal_Opcode();
6254     bool sign = (opcode != Op_URShiftVB);
6255     int vlen_enc = Assembler::AVX_256bit;
6256     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
6257     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6258     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6259     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
6260     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
6261   %}
6262   ins_pipe( pipe_slow );
6263 %}
6264 
6265 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6266   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
6267   match(Set dst ( LShiftVB src shift));
6268   match(Set dst ( RShiftVB src shift));
6269   match(Set dst (URShiftVB src shift));
6270   effect(TEMP dst, TEMP tmp, TEMP scratch);
6271   format %{ "vector_byte_shift $dst,$src,$shift" %}
6272   ins_encode %{
6273     assert(UseAVX > 1, "required");
6274     int opcode = this->ideal_Opcode();
6275     bool sign = (opcode != Op_URShiftVB);
6276     int vlen_enc = Assembler::AVX_256bit;
6277     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
6278     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6279     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6280     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6281     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6282     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6283     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6284     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6285     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6286   %}
6287   ins_pipe( pipe_slow );
6288 %}
6289 
6290 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6291   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
6292   match(Set dst ( LShiftVB src shift));
6293   match(Set dst ( RShiftVB src shift));
6294   match(Set dst (URShiftVB src shift));
6295   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6296   format %{ "vector_byte_shift $dst,$src,$shift" %}
6297   ins_encode %{
6298     assert(UseAVX > 2, "required");
6299     int opcode = this->ideal_Opcode();
6300     bool sign = (opcode != Op_URShiftVB);
6301     int vlen_enc = Assembler::AVX_512bit;
6302     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6303     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
6304     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6305     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6306     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6307     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6308     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6309     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6310     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6311     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
6312     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
6313     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6314   %}
6315   ins_pipe( pipe_slow );
6316 %}
6317 
6318 // Shorts vector logical right shift produces an incorrect Java result
6319 // for negative data because Java code converts a short value into an int with
6320 // sign extension before a shift. But char vectors are fine since chars are
6321 // unsigned values.
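     // Example: for short s = (short)0x8000, Java evaluates s >>> 1 on the
     // sign-extended int 0xFFFF8000 and gets 0x7FFFC000 (low 16 bits 0xC000),
     // whereas a packed 16-bit logical shift of 0x8000 would produce 0x4000.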
6322 // Shorts/Chars vector left shift
6323 instruct vshiftS(vec dst, vec src, vec shift) %{
6324   predicate(!n->as_ShiftV()->is_var_shift());
6325   match(Set dst ( LShiftVS src shift));
6326   match(Set dst ( RShiftVS src shift));
6327   match(Set dst (URShiftVS src shift));
6328   effect(TEMP dst, USE src, USE shift);
6329   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
6330   ins_encode %{
6331     int opcode = this->ideal_Opcode();
6332     if (UseAVX > 0) {
6333       int vlen_enc = vector_length_encoding(this);
6334       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6335     } else {
6336       int vlen = Matcher::vector_length(this);
6337       if (vlen == 2) {
6338         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6339         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6340       } else if (vlen == 4) {
6341         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6342         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6343       } else {
6344         assert (vlen == 8, "sanity");
6345         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6346         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6347       }
6348     }
6349   %}
6350   ins_pipe( pipe_slow );
6351 %}
6352 
6353 // Integers vector left shift
6354 instruct vshiftI(vec dst, vec src, vec shift) %{
6355   predicate(!n->as_ShiftV()->is_var_shift());
6356   match(Set dst ( LShiftVI src shift));
6357   match(Set dst ( RShiftVI src shift));
6358   match(Set dst (URShiftVI src shift));
6359   effect(TEMP dst, USE src, USE shift);
6360   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
6361   ins_encode %{
6362     int opcode = this->ideal_Opcode();
6363     if (UseAVX > 0) {
6364       int vlen_enc = vector_length_encoding(this);
6365       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6366     } else {
6367       int vlen = Matcher::vector_length(this);
6368       if (vlen == 2) {
6369         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6370         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6371       } else {
6372         assert(vlen == 4, "sanity");
6373         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6374         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6375       }
6376     }
6377   %}
6378   ins_pipe( pipe_slow );
6379 %}
6380 
6381 // Integers vector left constant shift
6382 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
6383   match(Set dst (LShiftVI src (LShiftCntV shift)));
6384   match(Set dst (RShiftVI src (RShiftCntV shift)));
6385   match(Set dst (URShiftVI src (RShiftCntV shift)));
6386   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
6387   ins_encode %{
6388     int opcode = this->ideal_Opcode();
6389     if (UseAVX > 0) {
6390       int vector_len = vector_length_encoding(this);
6391       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6392     } else {
6393       int vlen = Matcher::vector_length(this);
6394       if (vlen == 2) {
6395         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6396         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6397       } else {
6398         assert(vlen == 4, "sanity");
6399         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6400         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6401       }
6402     }
6403   %}
6404   ins_pipe( pipe_slow );
6405 %}
6406 
6407 // Longs vector shift
6408 instruct vshiftL(vec dst, vec src, vec shift) %{
6409   predicate(!n->as_ShiftV()->is_var_shift());
6410   match(Set dst ( LShiftVL src shift));
6411   match(Set dst (URShiftVL src shift));
6412   effect(TEMP dst, USE src, USE shift);
6413   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
6414   ins_encode %{
6415     int opcode = this->ideal_Opcode();
6416     if (UseAVX > 0) {
6417       int vlen_enc = vector_length_encoding(this);
6418       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6419     } else {
6420       assert(Matcher::vector_length(this) == 2, "");
6421       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6422       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6423     }
6424   %}
6425   ins_pipe( pipe_slow );
6426 %}
6427 
6428 // Longs vector constant shift
6429 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
6430   match(Set dst (LShiftVL src (LShiftCntV shift)));
6431   match(Set dst (URShiftVL src (RShiftCntV shift)));
6432   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
6433   ins_encode %{
6434     int opcode = this->ideal_Opcode();
6435     if (UseAVX > 0) {
6436       int vector_len = vector_length_encoding(this);
6437       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6438     } else {
6439       assert(Matcher::vector_length(this) == 2, "");
6440       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6441       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6442     }
6443   %}
6444   ins_pipe( pipe_slow );
6445 %}
6446 
6447 // -------------------ArithmeticRightShift -----------------------------------
6448 // Long vector arithmetic right shift
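     // Without AVX-512 there is no packed 64-bit arithmetic right shift, so it
     // is emulated via  sra(x, s) = ((x >>> s) ^ m) - m  with
     // m = 0x8000000000000000 >>> s (vector_long_sign_mask shifted by the same
     // count); the xor/subtract pair re-extends the sign bits.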
6449 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6450   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
6451   match(Set dst (RShiftVL src shift));
6452   effect(TEMP dst, TEMP tmp, TEMP scratch);
6453   format %{ "vshiftq $dst,$src,$shift" %}
6454   ins_encode %{
6455     uint vlen = Matcher::vector_length(this);
6456     if (vlen == 2) {
6457       assert(UseSSE >= 2, "required");
6458       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6459       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6460       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6461       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6462       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6463       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6464     } else {
6465       assert(vlen == 4, "sanity");
6466       assert(UseAVX > 1, "required");
6467       int vlen_enc = Assembler::AVX_256bit;
6468       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6469       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6470       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6471       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6472       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6473     }
6474   %}
6475   ins_pipe( pipe_slow );
6476 %}
6477 
6478 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6479   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
6480   match(Set dst (RShiftVL src shift));
6481   format %{ "vshiftq $dst,$src,$shift" %}
6482   ins_encode %{
6483     int vlen_enc = vector_length_encoding(this);
6484     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6485   %}
6486   ins_pipe( pipe_slow );
6487 %}
6488 
6489 // ------------------- Variable Shift -----------------------------
6490 // Byte variable shift
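     // There is no per-element byte shift on x86.  Byte lanes are widened to
     // words (varshiftbw), shifted as words, and narrowed back with vpackuswb.
     // Without AVX-512BW the vector is processed in 64-bit halves of each
     // 128-bit lane; with AVX-512BW evarshiftb handles a full vector (in two
     // 256-bit halves for 512-bit vectors).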
6491 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6492   predicate(Matcher::vector_length(n) <= 8 &&
6493             n->as_ShiftV()->is_var_shift() &&
6494             !VM_Version::supports_avx512bw());
6495   match(Set dst ( LShiftVB src shift));
6496   match(Set dst ( RShiftVB src shift));
6497   match(Set dst (URShiftVB src shift));
6498   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6499   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6500   ins_encode %{
6501     assert(UseAVX >= 2, "required");
6502 
6503     int opcode = this->ideal_Opcode();
6504     int vlen_enc = Assembler::AVX_128bit;
6505     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6506     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6507   %}
6508   ins_pipe( pipe_slow );
6509 %}
6510 
6511 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6512   predicate(Matcher::vector_length(n) == 16 &&
6513             n->as_ShiftV()->is_var_shift() &&
6514             !VM_Version::supports_avx512bw());
6515   match(Set dst ( LShiftVB src shift));
6516   match(Set dst ( RShiftVB src shift));
6517   match(Set dst (URShiftVB src shift));
6518   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6519   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6520   ins_encode %{
6521     assert(UseAVX >= 2, "required");
6522 
6523     int opcode = this->ideal_Opcode();
6524     int vlen_enc = Assembler::AVX_128bit;
6525     // Shift lower half and get word result in dst
6526     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6527 
6528     // Shift upper half and get word result in vtmp1
6529     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6530     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6531     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6532 
6533     // Merge and down convert the two word results to byte in dst
6534     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6535   %}
6536   ins_pipe( pipe_slow );
6537 %}
6538 
6539 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
6540   predicate(Matcher::vector_length(n) == 32 &&
6541             n->as_ShiftV()->is_var_shift() &&
6542             !VM_Version::supports_avx512bw());
6543   match(Set dst ( LShiftVB src shift));
6544   match(Set dst ( RShiftVB src shift));
6545   match(Set dst (URShiftVB src shift));
6546   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
6547   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
6548   ins_encode %{
6549     assert(UseAVX >= 2, "required");
6550 
6551     int opcode = this->ideal_Opcode();
6552     int vlen_enc = Assembler::AVX_128bit;
6553     // Process lower 128 bits and get result in dst
6554     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6555     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6556     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6557     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6558     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6559 
6560     // Process higher 128 bits and get result in vtmp3
6561     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6562     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6563     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
6564     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
6565     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
6566     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6567     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
6568 
6569     // Merge the two results in dst
6570     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6571   %}
6572   ins_pipe( pipe_slow );
6573 %}
6574 
6575 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6576   predicate(Matcher::vector_length(n) <= 32 &&
6577             n->as_ShiftV()->is_var_shift() &&
6578             VM_Version::supports_avx512bw());
6579   match(Set dst ( LShiftVB src shift));
6580   match(Set dst ( RShiftVB src shift));
6581   match(Set dst (URShiftVB src shift));
6582   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6583   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6584   ins_encode %{
6585     assert(UseAVX > 2, "required");
6586 
6587     int opcode = this->ideal_Opcode();
6588     int vlen_enc = vector_length_encoding(this);
6589     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6590   %}
6591   ins_pipe( pipe_slow );
6592 %}
6593 
6594 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6595   predicate(Matcher::vector_length(n) == 64 &&
6596             n->as_ShiftV()->is_var_shift() &&
6597             VM_Version::supports_avx512bw());
6598   match(Set dst ( LShiftVB src shift));
6599   match(Set dst ( RShiftVB src shift));
6600   match(Set dst (URShiftVB src shift));
6601   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6602   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6603   ins_encode %{
6604     assert(UseAVX > 2, "required");
6605 
6606     int opcode = this->ideal_Opcode();
6607     int vlen_enc = Assembler::AVX_256bit;
6608     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6609     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6610     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6611     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6612     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6613   %}
6614   ins_pipe( pipe_slow );
6615 %}
6616 
6617 // Short variable shift
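     // Without AVX-512BW, short lanes are sign/zero-extended to ints (vextendwd /
     // vpmovzxwd), shifted as ints (varshiftd), masked back to 16 bits and packed
     // down with vpackusdw.  With AVX-512BW, varshiftw shifts the word lanes
     // directly.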
6618 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6619   predicate(Matcher::vector_length(n) <= 8 &&
6620             n->as_ShiftV()->is_var_shift() &&
6621             !VM_Version::supports_avx512bw());
6622   match(Set dst ( LShiftVS src shift));
6623   match(Set dst ( RShiftVS src shift));
6624   match(Set dst (URShiftVS src shift));
6625   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6626   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6627   ins_encode %{
6628     assert(UseAVX >= 2, "required");
6629 
6630     int opcode = this->ideal_Opcode();
6631     bool sign = (opcode != Op_URShiftVS);
6632     int vlen_enc = Assembler::AVX_256bit;
6633     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, 1);
6634     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, 1);
6635     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
6636     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6637     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
6638     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6639   %}
6640   ins_pipe( pipe_slow );
6641 %}
6642 
6643 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6644   predicate(Matcher::vector_length(n) == 16 &&
6645             n->as_ShiftV()->is_var_shift() &&
6646             !VM_Version::supports_avx512bw());
6647   match(Set dst ( LShiftVS src shift));
6648   match(Set dst ( RShiftVS src shift));
6649   match(Set dst (URShiftVS src shift));
6650   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6651   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6652   ins_encode %{
6653     assert(UseAVX >= 2, "required");
6654 
6655     int opcode = this->ideal_Opcode();
6656     bool sign = (opcode != Op_URShiftVS);
6657     int vlen_enc = Assembler::AVX_256bit;
6658     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
6659     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6660     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6661     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6662     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6663 
6664     // Shift upper half, with result in dst using vtmp1 as TEMP
6665     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
6666     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
6667     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6668     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6669     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6670     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6671 
6672     // Merge lower and upper half result into dst
6673     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6674     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6675   %}
6676   ins_pipe( pipe_slow );
6677 %}
6678 
6679 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
6680   predicate(n->as_ShiftV()->is_var_shift() &&
6681             VM_Version::supports_avx512bw());
6682   match(Set dst ( LShiftVS src shift));
6683   match(Set dst ( RShiftVS src shift));
6684   match(Set dst (URShiftVS src shift));
6685   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
6686   ins_encode %{
6687     assert(UseAVX > 2, "required");
6688 
6689     int opcode = this->ideal_Opcode();
6690     int vlen_enc = vector_length_encoding(this);
6691     if (!VM_Version::supports_avx512vl()) {
6692       vlen_enc = Assembler::AVX_512bit;
6693     }
6694     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6695   %}
6696   ins_pipe( pipe_slow );
6697 %}
6698 
6699 // Integer variable shift
6700 instruct vshiftI_var(vec dst, vec src, vec shift) %{
6701   predicate(n->as_ShiftV()->is_var_shift());
6702   match(Set dst ( LShiftVI src shift));
6703   match(Set dst ( RShiftVI src shift));
6704   match(Set dst (URShiftVI src shift));
6705   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
6706   ins_encode %{
6707     assert(UseAVX >= 2, "required");
6708 
6709     int opcode = this->ideal_Opcode();
6710     int vlen_enc = vector_length_encoding(this);
6711     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6712   %}
6713   ins_pipe( pipe_slow );
6714 %}
6715 
6716 // Long variable shift
6717 instruct vshiftL_var(vec dst, vec src, vec shift) %{
6718   predicate(n->as_ShiftV()->is_var_shift());
6719   match(Set dst ( LShiftVL src shift));
6720   match(Set dst (URShiftVL src shift));
6721   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6722   ins_encode %{
6723     assert(UseAVX >= 2, "required");
6724 
6725     int opcode = this->ideal_Opcode();
6726     int vlen_enc = vector_length_encoding(this);
6727     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6728   %}
6729   ins_pipe( pipe_slow );
6730 %}
6731 
6732 // Long variable arithmetic right shift
6733 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
6734   predicate(Matcher::vector_length(n) <= 4 &&
6735             n->as_ShiftV()->is_var_shift() &&
6736             UseAVX == 2);
6737   match(Set dst (RShiftVL src shift));
6738   effect(TEMP dst, TEMP vtmp);
6739   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
6740   ins_encode %{
6741     int opcode = this->ideal_Opcode();
6742     int vlen_enc = vector_length_encoding(this);
6743     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
6744                  $vtmp$$XMMRegister);
6745   %}
6746   ins_pipe( pipe_slow );
6747 %}
6748 
6749 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
6750   predicate(n->as_ShiftV()->is_var_shift() &&
6751             UseAVX > 2);
6752   match(Set dst (RShiftVL src shift));
6753   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6754   ins_encode %{
6755     int opcode = this->ideal_Opcode();
6756     int vlen_enc = vector_length_encoding(this);
6757     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6758   %}
6759   ins_pipe( pipe_slow );
6760 %}
6761 
6762 // --------------------------------- AND --------------------------------------
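     // The AND/OR/XOR rules below use the two-operand SSE forms (pand/por/pxor,
     // overwriting dst) when UseAVX == 0, and the three-operand VEX forms
     // otherwise, including memory-operand variants for vectors of at least
     // 16 bytes.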
6763 
6764 instruct vand(vec dst, vec src) %{
6765   predicate(UseAVX == 0);
6766   match(Set dst (AndV dst src));
6767   format %{ "pand    $dst,$src\t! and vectors" %}
6768   ins_encode %{
6769     __ pand($dst$$XMMRegister, $src$$XMMRegister);
6770   %}
6771   ins_pipe( pipe_slow );
6772 %}
6773 
6774 instruct vand_reg(vec dst, vec src1, vec src2) %{
6775   predicate(UseAVX > 0);
6776   match(Set dst (AndV src1 src2));
6777   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
6778   ins_encode %{
6779     int vlen_enc = vector_length_encoding(this);
6780     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6781   %}
6782   ins_pipe( pipe_slow );
6783 %}
6784 
6785 instruct vand_mem(vec dst, vec src, memory mem) %{
6786   predicate((UseAVX > 0) &&
6787             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6788   match(Set dst (AndV src (LoadVector mem)));
6789   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
6790   ins_encode %{
6791     int vlen_enc = vector_length_encoding(this);
6792     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6793   %}
6794   ins_pipe( pipe_slow );
6795 %}
6796 
6797 // --------------------------------- OR ---------------------------------------
6798 
6799 instruct vor(vec dst, vec src) %{
6800   predicate(UseAVX == 0);
6801   match(Set dst (OrV dst src));
6802   format %{ "por     $dst,$src\t! or vectors" %}
6803   ins_encode %{
6804     __ por($dst$$XMMRegister, $src$$XMMRegister);
6805   %}
6806   ins_pipe( pipe_slow );
6807 %}
6808 
6809 instruct vor_reg(vec dst, vec src1, vec src2) %{
6810   predicate(UseAVX > 0);
6811   match(Set dst (OrV src1 src2));
6812   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
6813   ins_encode %{
6814     int vlen_enc = vector_length_encoding(this);
6815     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6816   %}
6817   ins_pipe( pipe_slow );
6818 %}
6819 
6820 instruct vor_mem(vec dst, vec src, memory mem) %{
6821   predicate((UseAVX > 0) &&
6822             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6823   match(Set dst (OrV src (LoadVector mem)));
6824   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
6825   ins_encode %{
6826     int vlen_enc = vector_length_encoding(this);
6827     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6828   %}
6829   ins_pipe( pipe_slow );
6830 %}
6831 
6832 // --------------------------------- XOR --------------------------------------
6833 
6834 instruct vxor(vec dst, vec src) %{
6835   predicate(UseAVX == 0);
6836   match(Set dst (XorV dst src));
6837   format %{ "pxor    $dst,$src\t! xor vectors" %}
6838   ins_encode %{
6839     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
6840   %}
6841   ins_pipe( pipe_slow );
6842 %}
6843 
6844 instruct vxor_reg(vec dst, vec src1, vec src2) %{
6845   predicate(UseAVX > 0);
6846   match(Set dst (XorV src1 src2));
6847   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
6848   ins_encode %{
6849     int vlen_enc = vector_length_encoding(this);
6850     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6851   %}
6852   ins_pipe( pipe_slow );
6853 %}
6854 
6855 instruct vxor_mem(vec dst, vec src, memory mem) %{
6856   predicate((UseAVX > 0) &&
6857             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6858   match(Set dst (XorV src (LoadVector mem)));
6859   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
6860   ins_encode %{
6861     int vlen_enc = vector_length_encoding(this);
6862     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6863   %}
6864   ins_pipe( pipe_slow );
6865 %}
6866 
6867 // --------------------------------- VectorCast --------------------------------------
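     // Widening casts sign-extend with vpmovsx* and, for FP targets, convert with
     // vcvtdq2ps/vcvtdq2pd.  Narrowing casts either mask-and-pack (AVX/AVX2) or
     // use the AVX-512 down-converting moves (evpmovwb, evpmovdb, evpmovqd, ...).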
6868 
6869 instruct vcastBtoX(vec dst, vec src) %{
6870   match(Set dst (VectorCastB2X src));
6871   format %{ "vector_cast_b2x $dst,$src\t!" %}
6872   ins_encode %{
6873     assert(UseAVX > 0, "required");
6874 
6875     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6876     int vlen_enc = vector_length_encoding(this);
6877     switch (to_elem_bt) {
6878       case T_SHORT:
6879         __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6880         break;
6881       case T_INT:
6882         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6883         break;
6884       case T_FLOAT:
6885         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6886         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6887         break;
6888       case T_LONG:
6889         __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6890         break;
6891       case T_DOUBLE:
6892         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6893         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6894         break;
6895 
6896       default: assert(false, "%s", type2name(to_elem_bt));
6897     }
6898   %}
6899   ins_pipe( pipe_slow );
6900 %}
6901 
6902 instruct castStoX(vec dst, vec src, rRegP scratch) %{
6903   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6904             Matcher::vector_length(n->in(1)) <= 8 && // src
6905             Matcher::vector_element_basic_type(n) == T_BYTE);
6906   effect(TEMP scratch);
6907   match(Set dst (VectorCastS2X src));
6908   format %{ "vector_cast_s2x $dst,$src\t! using $scratch as TEMP" %}
6909   ins_encode %{
6910     assert(UseAVX > 0, "required");
6911 
6912     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, $scratch$$Register);
6913     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6914   %}
6915   ins_pipe( pipe_slow );
6916 %}
6917 
6918 instruct vcastStoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6919   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6920             Matcher::vector_length(n->in(1)) == 16 && // src
6921             Matcher::vector_element_basic_type(n) == T_BYTE);
6922   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6923   match(Set dst (VectorCastS2X src));
6924   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp, $scratch as TEMP" %}
6925   ins_encode %{
6926     assert(UseAVX > 0, "required");
6927 
6928     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
6929     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6930     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
6931     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6932   %}
6933   ins_pipe( pipe_slow );
6934 %}
6935 
6936 instruct vcastStoX_evex(vec dst, vec src) %{
6937   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
6938             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
6939   match(Set dst (VectorCastS2X src));
6940   format %{ "vector_cast_s2x $dst,$src\t!" %}
6941   ins_encode %{
6942     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6943     int src_vlen_enc = vector_length_encoding(this, $src);
6944     int vlen_enc = vector_length_encoding(this);
6945     switch (to_elem_bt) {
6946       case T_BYTE:
6947         if (!VM_Version::supports_avx512vl()) {
6948           vlen_enc = Assembler::AVX_512bit;
6949         }
6950         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6951         break;
6952       case T_INT:
6953         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6954         break;
6955       case T_FLOAT:
6956         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6957         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6958         break;
6959       case T_LONG:
6960         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6961         break;
6962       case T_DOUBLE:
6963         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6964         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6965         break;
6966       default:
6967         ShouldNotReachHere();
6968     }
6969   %}
6970   ins_pipe( pipe_slow );
6971 %}
6972 
6973 instruct castItoX(vec dst, vec src, rRegP scratch) %{
6974   predicate(UseAVX <= 2 &&
6975             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
6976             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
6977   match(Set dst (VectorCastI2X src));
6978   format %{ "vector_cast_i2x $dst,$src\t! using $scratch as TEMP" %}
6979   effect(TEMP scratch);
6980   ins_encode %{
6981     assert(UseAVX > 0, "required");
6982 
6983     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6984     int vlen_enc = vector_length_encoding(this, $src);
6985 
6986     if (to_elem_bt == T_BYTE) {
6987       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
6988       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6989       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6990     } else {
6991       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
6992       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6993       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6994     }
6995   %}
6996   ins_pipe( pipe_slow );
6997 %}
6998 
6999 instruct vcastItoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
7000   predicate(UseAVX <= 2 &&
7001             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
7002             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
7003   match(Set dst (VectorCastI2X src));
7004   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp and $scratch as TEMP" %}
7005   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7006   ins_encode %{
7007     assert(UseAVX > 0, "required");
7008 
7009     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7010     int vlen_enc = vector_length_encoding(this, $src);
7011 
7012     if (to_elem_bt == T_BYTE) {
7013       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
7014       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
7015       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7016       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7017     } else {
7018       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
7019       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
7020       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
7021       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7022     }
7023   %}
7024   ins_pipe( pipe_slow );
7025 %}
7026 
7027 instruct vcastItoX_evex(vec dst, vec src) %{
7028   predicate(UseAVX > 2 ||
7029             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
7030   match(Set dst (VectorCastI2X src));
7031   format %{ "vector_cast_i2x $dst,$src\t!" %}
7032   ins_encode %{
7033     assert(UseAVX > 0, "required");
7034 
7035     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
7036     int src_vlen_enc = vector_length_encoding(this, $src);
7037     int dst_vlen_enc = vector_length_encoding(this);
7038     switch (dst_elem_bt) {
7039       case T_BYTE:
7040         if (!VM_Version::supports_avx512vl()) {
7041           src_vlen_enc = Assembler::AVX_512bit;
7042         }
7043         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7044         break;
7045       case T_SHORT:
7046         if (!VM_Version::supports_avx512vl()) {
7047           src_vlen_enc = Assembler::AVX_512bit;
7048         }
7049         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7050         break;
7051       case T_FLOAT:
7052         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7053         break;
7054       case T_LONG:
7055         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7056         break;
7057       case T_DOUBLE:
7058         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7059         break;
7060       default:
7061         ShouldNotReachHere();
7062     }
7063   %}
7064   ins_pipe( pipe_slow );
7065 %}
7066 
7067 instruct vcastLtoBS(vec dst, vec src, rRegP scratch) %{
7068   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
7069             UseAVX <= 2);
7070   match(Set dst (VectorCastL2X src));
7071   effect(TEMP scratch);
7072   format %{ "vector_cast_l2x  $dst,$src\t! using $scratch as TEMP" %}
7073   ins_encode %{
7074     assert(UseAVX > 0, "required");
7075 
7076     int vlen = Matcher::vector_length_in_bytes(this, $src);
7077     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
7078     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
7079                                                       : ExternalAddress(vector_int_to_short_mask());
7080     if (vlen <= 16) {
7081       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
7082       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
7083       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7084     } else {
7085       assert(vlen <= 32, "required");
7086       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
7087       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
7088       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
7089       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7090     }
7091     if (to_elem_bt == T_BYTE) {
7092       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7093     }
7094   %}
7095   ins_pipe( pipe_slow );
7096 %}
7097 
7098 instruct vcastLtoX_evex(vec dst, vec src) %{
7099   predicate(UseAVX > 2 ||
7100             (Matcher::vector_element_basic_type(n) == T_INT ||
7101              Matcher::vector_element_basic_type(n) == T_FLOAT ||
7102              Matcher::vector_element_basic_type(n) == T_DOUBLE));
7103   match(Set dst (VectorCastL2X src));
7104   format %{ "vector_cast_l2x  $dst,$src\t!" %}
7105   ins_encode %{
7106     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7107     int vlen = Matcher::vector_length_in_bytes(this, $src);
7108     int vlen_enc = vector_length_encoding(this, $src);
7109     switch (to_elem_bt) {
7110       case T_BYTE:
7111         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
7112           vlen_enc = Assembler::AVX_512bit;
7113         }
7114         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7115         break;
7116       case T_SHORT:
7117         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
7118           vlen_enc = Assembler::AVX_512bit;
7119         }
7120         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7121         break;
7122       case T_INT:
7123         if (vlen == 8) {
7124           if ($dst$$XMMRegister != $src$$XMMRegister) {
7125             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
7126           }
7127         } else if (vlen == 16) {
7128           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
7129         } else if (vlen == 32) {
7130           if (UseAVX > 2) {
7131             if (!VM_Version::supports_avx512vl()) {
7132               vlen_enc = Assembler::AVX_512bit;
7133             }
7134             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7135           } else {
7136             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
7137             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
7138           }
7139         } else { // vlen == 64
7140           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7141         }
7142         break;
7143       case T_FLOAT:
7144         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
7145         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7146         break;
7147       case T_DOUBLE:
7148         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
7149         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7150         break;
7151 
7152       default: assert(false, "%s", type2name(to_elem_bt));
7153     }
7154   %}
7155   ins_pipe( pipe_slow );
7156 %}
7157 
7158 instruct vcastFtoD_reg(vec dst, vec src) %{
7159   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
7160   match(Set dst (VectorCastF2X src));
7161   format %{ "vector_cast_f2x  $dst,$src\t!" %}
7162   ins_encode %{
7163     int vlen_enc = vector_length_encoding(this);
7164     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7165   %}
7166   ins_pipe( pipe_slow );
7167 %}
7168 
7169 instruct vcastDtoF_reg(vec dst, vec src) %{
7170   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
7171   match(Set dst (VectorCastD2X src));
7172   format %{ "vector_cast_d2x  $dst,$src\t!" %}
7173   ins_encode %{
7174     int vlen_enc = vector_length_encoding(this, $src);
7175     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7176   %}
7177   ins_pipe( pipe_slow );
7178 %}
7179 
7180 // --------------------------------- VectorMaskCmp --------------------------------------
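     // On AVX/AVX2 a compare produces an all-ones/all-zeros lane mask in a vector
     // register (vcmpps/vcmppd/vpcmpCCW).  On AVX-512 the result is a k-register;
     // when a 512-bit vector result is required, the k-mask is expanded with a
     // masked load of vector_all_bits_set().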
7181 
7182 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
7183   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7184             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
7185             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7186             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7187   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7188   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
7189   ins_encode %{
7190     int vlen_enc = vector_length_encoding(this, $src1);
7191     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7192     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7193       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7194     } else {
7195       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7196     }
7197   %}
7198   ins_pipe( pipe_slow );
7199 %}
7200 
7201 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
7202   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
7203             n->bottom_type()->isa_vectmask() == NULL &&
7204             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7205   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7206   effect(TEMP scratch, TEMP ktmp);
7207   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7208   ins_encode %{
7209     int vlen_enc = Assembler::AVX_512bit;
7210     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7211     KRegister mask = k0; // The comparison itself is not being masked.
7212     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7213       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7214       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
7215     } else {
7216       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7217       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
7218     }
7219   %}
7220   ins_pipe( pipe_slow );
7221 %}
7222 
7223 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
7224   predicate(n->bottom_type()->isa_vectmask() &&
7225             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7226   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7227   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
7228   ins_encode %{
7229     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
7230     int vlen_enc = vector_length_encoding(this, $src1);
7231     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7232     KRegister mask = k0; // The comparison itself is not being masked.
7233     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7234       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7235     } else {
7236       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7237     }
7238   %}
7239   ins_pipe( pipe_slow );
7240 %}
7241 
7242 instruct vcmp(legVec dst, legVec src1, legVec src2, immI8 cond, rRegP scratch) %{
7243   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7244             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7245             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
7246             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7247             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7248   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7249   effect(TEMP scratch);
7250   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7251   ins_encode %{
7252     int vlen_enc = vector_length_encoding(this, $src1);
7253     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7254     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
7255     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, ww, vlen_enc, $scratch$$Register);
7256   %}
7257   ins_pipe( pipe_slow );
7258 %}
7259 
7260 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
7261   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7262             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7263             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
7264             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 16 && // src1
7265             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7266   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7267   effect(TEMP vtmp1, TEMP vtmp2, TEMP scratch);
7268   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7269   ins_encode %{
7270     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7271     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7272     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
7273     __ vpcmpu(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
7274               $vtmp2$$XMMRegister, $scratch$$Register);
7275   %}
7276   ins_pipe( pipe_slow );
7277 %}
7278 
7279 instruct vcmpu32(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, legVec vtmp3, rRegP scratch) %{
7280   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7281             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7282             Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 32 && // src1
7283             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7284   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7285   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP scratch);
7286   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7287   ins_encode %{
7288     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7289     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7290     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
7291     __ vpcmpu32(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
7292                 $vtmp2$$XMMRegister, $vtmp3$$XMMRegister, $scratch$$Register);
7293   %}
7294   ins_pipe( pipe_slow );
7295 %}
7296 
7297 instruct vcmpu64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
7298   predicate((n->bottom_type()->isa_vectmask() == NULL &&
7299              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
7300              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7301   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7302   effect(TEMP scratch, TEMP ktmp);
7303   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7304   ins_encode %{
7305     assert(UseAVX > 2, "required");
7306 
7307     int vlen_enc = vector_length_encoding(this, $src1);
7308     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7309     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
7310     KRegister mask = k0; // The comparison itself is not being masked.
7311     bool merge = false;
7312     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7313 
7314     switch (src1_elem_bt) {
7315       case T_INT: {
7316         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7317         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7318         break;
7319       }
7320       case T_LONG: {
7321         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7322         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7323         break;
7324       }
7325       default: assert(false, "%s", type2name(src1_elem_bt));
7326     }
7327   %}
7328   ins_pipe( pipe_slow );
7329 %}
7330 
7331 
7332 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
7333   predicate(n->bottom_type()->isa_vectmask() &&
7334             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7335   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7336   format %{ "vector_compared_evex $dst,$src1,$src2,$cond\t!" %}
7337   ins_encode %{
7338     assert(UseAVX > 2, "required");
7339     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
7340 
7341     int vlen_enc = vector_length_encoding(this, $src1);
7342     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7343     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
7344     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7345 
7346     // Compare lane-wise according to the element basic type of src1.
7347     switch (src1_elem_bt) {
7348       case T_BYTE: {
7349         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7350         break;
7351       }
7352       case T_SHORT: {
7353         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7354         break;
7355       }
7356       case T_INT: {
7357         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7358         break;
7359       }
7360       case T_LONG: {
7361         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7362         break;
7363       }
7364       default: assert(false, "%s", type2name(src1_elem_bt));
7365     }
7366   %}
7367   ins_pipe( pipe_slow );
7368 %}
7369 
7370 // Extract
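     // Extracting an element from a vector wider than 128 bits is a two-step
     // operation: get_lane copies the 128-bit lane holding the element into a
     // temporary, then get_elem pulls the scalar out of that lane.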
7371 
7372 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
7373   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
7374   match(Set dst (ExtractI src idx));
7375   match(Set dst (ExtractS src idx));
7376 #ifdef _LP64
7377   match(Set dst (ExtractB src idx));
7378 #endif
7379   format %{ "extractI $dst,$src,$idx\t!" %}
7380   ins_encode %{
7381     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7382 
7383     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7384     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7385   %}
7386   ins_pipe( pipe_slow );
7387 %}
7388 
7389 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
7390   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
7391             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
7392   match(Set dst (ExtractI src idx));
7393   match(Set dst (ExtractS src idx));
7394 #ifdef _LP64
7395   match(Set dst (ExtractB src idx));
7396 #endif
7397   effect(TEMP vtmp);
7398   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
7399   ins_encode %{
7400     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7401 
7402     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7403     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7404     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
7405   %}
7406   ins_pipe( pipe_slow );
7407 %}
7408 
7409 #ifdef _LP64
7410 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
7411   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
7412   match(Set dst (ExtractL src idx));
7413   format %{ "extractL $dst,$src,$idx\t!" %}
7414   ins_encode %{
7415     assert(UseSSE >= 4, "required");
7416     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7417 
7418     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7419   %}
7420   ins_pipe( pipe_slow );
7421 %}
7422 
7423 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
7424   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7425             Matcher::vector_length(n->in(1)) == 8);  // src
7426   match(Set dst (ExtractL src idx));
7427   effect(TEMP vtmp);
7428   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
7429   ins_encode %{
7430     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7431 
7432     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7433     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
7434   %}
7435   ins_pipe( pipe_slow );
7436 %}
7437 #endif
7438 
7439 instruct extractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7440   predicate(Matcher::vector_length(n->in(1)) <= 4);
7441   match(Set dst (ExtractF src idx));
7442   effect(TEMP dst, TEMP tmp, TEMP vtmp);
7443   format %{ "extractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7444   ins_encode %{
7445     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7446 
7447     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $tmp$$Register, $vtmp$$XMMRegister);
7448   %}
7449   ins_pipe( pipe_slow );
7450 %}
7451 
7452 instruct vextractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7453   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
7454             Matcher::vector_length(n->in(1)/*src*/) == 16);
7455   match(Set dst (ExtractF src idx));
7456   effect(TEMP tmp, TEMP vtmp);
7457   format %{ "vextractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7458   ins_encode %{
7459     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7460 
7461     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7462     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant, $tmp$$Register);
7463   %}
7464   ins_pipe( pipe_slow );
7465 %}
7466 
7467 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
7468   predicate(Matcher::vector_length(n->in(1)) == 2); // src
7469   match(Set dst (ExtractD src idx));
7470   format %{ "extractD $dst,$src,$idx\t!" %}
7471   ins_encode %{
7472     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7473 
7474     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7475   %}
7476   ins_pipe( pipe_slow );
7477 %}
7478 
7479 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
7480   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7481             Matcher::vector_length(n->in(1)) == 8);  // src
7482   match(Set dst (ExtractD src idx));
7483   effect(TEMP vtmp);
7484   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
7485   ins_encode %{
7486     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7487 
7488     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7489     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
7490   %}
7491   ins_pipe( pipe_slow );
7492 %}
7493 
7494 // --------------------------------- Vector Blend --------------------------------------
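     // SSE4.1 pblendvb implicitly uses xmm0 as the selector, hence the fixed
     // rxmm0 temp below.  AVX blends take the mask operand explicitly.  For
     // 512-bit vectors the byte mask is first converted into a k-register by
     // comparing it against all-ones, then evpblend applies it.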
7495 
7496 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
7497   predicate(UseAVX == 0);
7498   match(Set dst (VectorBlend (Binary dst src) mask));
7499   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
7500   effect(TEMP tmp);
7501   ins_encode %{
7502     assert(UseSSE >= 4, "required");
7503 
7504     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
7505       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
7506     }
7507     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
7508   %}
7509   ins_pipe( pipe_slow );
7510 %}
7511 
7512 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
7513   predicate(UseAVX > 0 &&
7514             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
7515             Matcher::vector_length_in_bytes(n) <= 32 &&
7516             is_integral_type(Matcher::vector_element_basic_type(n)));
7517   match(Set dst (VectorBlend (Binary src1 src2) mask));
7518   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7519   ins_encode %{
7520     int vlen_enc = vector_length_encoding(this);
7521     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7522   %}
7523   ins_pipe( pipe_slow );
7524 %}
7525 
7526 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
7527   predicate(UseAVX > 0 &&
7528             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
7529             Matcher::vector_length_in_bytes(n) <= 32 &&
7530             !is_integral_type(Matcher::vector_element_basic_type(n)));
7531   match(Set dst (VectorBlend (Binary src1 src2) mask));
7532   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7533   ins_encode %{
7534     int vlen_enc = vector_length_encoding(this);
7535     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7536   %}
7537   ins_pipe( pipe_slow );
7538 %}
7539 
7540 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch, kReg ktmp) %{
7541   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
7542             n->in(2)->bottom_type()->isa_vectmask() == NULL);
7543   match(Set dst (VectorBlend (Binary src1 src2) mask));
7544   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch and $ktmp as TEMP" %}
7545   effect(TEMP scratch, TEMP ktmp);
7546   ins_encode %{
7547     int vlen_enc = Assembler::AVX_512bit;
7548     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7549     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register);
7550     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7551   %}
7552   ins_pipe( pipe_slow );
7553 %}
7554 
7555 
7556 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask, rRegP scratch) %{
7557   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
7558             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
7559              VM_Version::supports_avx512bw()));
7560   match(Set dst (VectorBlend (Binary src1 src2) mask));
7561   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch as TEMP" %}
7562   effect(TEMP scratch);
7563   ins_encode %{
7564     int vlen_enc = vector_length_encoding(this);
7565     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7566     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7567   %}
7568   ins_pipe( pipe_slow );
7569 %}
7570 
7571 // --------------------------------- ABS --------------------------------------
7572 // a = |a|
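     // pabsb/pabsw/pabsd (or their VEX forms) cover byte/short/int lanes; 64-bit
     // abs needs the AVX-512 evpabsq, widening to the 512-bit encoding when
     // AVX512VL is not available.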
7573 instruct vabsB_reg(vec dst, vec src) %{
7574   match(Set dst (AbsVB  src));
7575   ins_cost(450);
7576   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
7577   ins_encode %{
7578     uint vlen = Matcher::vector_length(this);
7579     if (vlen <= 16) {
7580       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7581     } else {
7582       int vlen_enc = vector_length_encoding(this);
7583       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7584     }
7585   %}
7586   ins_pipe( pipe_slow );
7587 %}
7588 
7589 instruct vabsS_reg(vec dst, vec src) %{
7590   match(Set dst (AbsVS  src));
7591   ins_cost(450);
7592   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
7593   ins_encode %{
7594     uint vlen = Matcher::vector_length(this);
7595     if (vlen <= 8) {
7596       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7597     } else {
7598       int vlen_enc = vector_length_encoding(this);
7599       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7600     }
7601   %}
7602   ins_pipe( pipe_slow );
7603 %}
7604 
7605 instruct vabsI_reg(vec dst, vec src) %{
7606   match(Set dst (AbsVI  src));
7607   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
7608   ins_cost(250);
7609   ins_encode %{
7610     uint vlen = Matcher::vector_length(this);
7611     if (vlen <= 4) {
7612       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7613     } else {
7614       int vlen_enc = vector_length_encoding(this);
7615       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7616     }
7617   %}
7618   ins_pipe( pipe_slow );
7619 %}
7620 
7621 instruct vabsL_reg(vec dst, vec src) %{
7622   match(Set dst (AbsVL  src));
7623   ins_cost(450);
7624   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
7625   ins_encode %{
7626     assert(UseAVX > 2, "required");
7627     int vlen_enc = vector_length_encoding(this);
7628     if (!VM_Version::supports_avx512vl()) {
7629       vlen_enc = Assembler::AVX_512bit;
7630     }
7631     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7632   %}
7633   ins_pipe( pipe_slow );
7634 %}
7635 
7636 // --------------------------------- ABSNEG --------------------------------------
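     // Abs clears the FP sign bit and Neg flips it, by AND-ing/XOR-ing the lanes
     // with a sign-mask constant; the scratch register holds the constant's
     // address (the "[mask]" shown in the formats below).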
7637 
7638 instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
7639   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
7640   match(Set dst (AbsVF src));
7641   match(Set dst (NegVF src));
7642   effect(TEMP scratch);
7643   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
7644   ins_cost(150);
7645   ins_encode %{
7646     int opcode = this->ideal_Opcode();
7647     int vlen = Matcher::vector_length(this);
7648     if (vlen == 2) {
7649       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7650     } else {
7651       assert(vlen == 8 || vlen == 16, "required");
7652       int vlen_enc = vector_length_encoding(this);
7653       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7654     }
7655   %}
7656   ins_pipe( pipe_slow );
7657 %}
7658 
7659 instruct vabsneg4F(vec dst, rRegI scratch) %{
7660   predicate(Matcher::vector_length(n) == 4);
7661   match(Set dst (AbsVF dst));
7662   match(Set dst (NegVF dst));
7663   effect(TEMP scratch);
7664   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
7665   ins_cost(150);
7666   ins_encode %{
7667     int opcode = this->ideal_Opcode();
7668     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
7669   %}
7670   ins_pipe( pipe_slow );
7671 %}
7672 
7673 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
7674   match(Set dst (AbsVD  src));
7675   match(Set dst (NegVD  src));
7676   effect(TEMP scratch);
7677   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
7678   ins_encode %{
7679     int opcode = this->ideal_Opcode();
7680     uint vlen = Matcher::vector_length(this);
7681     if (vlen == 2) {
7682       assert(UseSSE >= 2, "required");
7683       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7684     } else {
7685       int vlen_enc = vector_length_encoding(this);
7686       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7687     }
7688   %}
7689   ins_pipe( pipe_slow );
7690 %}
7691 
7692 //------------------------------------- VectorTest --------------------------------------------
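     // alltrue is evaluated with vectortest (ptest/vptest style), which sets the
     // carry flag for the BoolTest::overflow predicate; setb/movzbl then
     // materialize the 0/1 result.  With AVX-512 BW/DQ mask registers, alltrue
     // compares the k-registers directly.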
7693 
7694 #ifdef _LP64
7695 instruct vptest_alltrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp1, legVec vtmp2, rFlagsReg cr) %{
7696   predicate(!VM_Version::supports_avx512bwdq() &&
7697             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7698             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7699             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7700   match(Set dst (VectorTest src1 src2 ));
7701   effect(TEMP vtmp1, TEMP vtmp2, KILL cr);
7702   format %{ "vptest_alltrue_lt16 $dst,$src1,$src2\t! using $vtmp1, $vtmp2 and $cr as TEMP" %}
7703   ins_encode %{
7704     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7705     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
7706     __ setb(Assembler::carrySet, $dst$$Register);
7707     __ movzbl($dst$$Register, $dst$$Register);
7708   %}
7709   ins_pipe( pipe_slow );
7710 %}
7711 
7712 instruct vptest_alltrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7713   predicate(!VM_Version::supports_avx512bwdq() &&
7714             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7715             Matcher::vector_length_in_bytes(n->in(1)) <  64 &&
7716             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7717   match(Set dst (VectorTest src1 src2 ));
7718   effect(KILL cr);
7719   format %{ "vptest_alltrue_ge16 $dst,$src1,$src2\t! using $cr as TEMP" %}
7720   ins_encode %{
7721     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7722     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7723     __ setb(Assembler::carrySet, $dst$$Register);
7724     __ movzbl($dst$$Register, $dst$$Register);
7725   %}
7726   ins_pipe( pipe_slow );
7727 %}
7728 
7729 instruct vptest_alltrue_lt8_evex(rRegI dst, kReg src1, kReg src2, kReg kscratch, rFlagsReg cr) %{
7730   predicate(VM_Version::supports_avx512bwdq() &&
7731             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
7732             n->in(1)->bottom_type()->isa_vectmask() &&
7733             Matcher::vector_length(n->in(1)) < 8);
7734   match(Set dst (VectorTest src1 src2));
7735   effect(KILL cr, TEMP kscratch);
7736   format %{ "vptest_alltrue_lt8_evex $dst,$src1,$src2\t! using $cr and $kscratch as TEMP" %}
7737   ins_encode %{
7738     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7739     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7740     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7741     uint masklen = Matcher::vector_length(this, $src1);
7742     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, $kscratch$$KRegister);
7743   %}
7744   ins_pipe( pipe_slow );
7745 %}
7746 
7747 
7748 instruct vptest_alltrue_ge8_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
7749   predicate(VM_Version::supports_avx512bwdq() &&
7750             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
7751             n->in(1)->bottom_type()->isa_vectmask() &&
7752             Matcher::vector_length(n->in(1)) >= 8);
7753   match(Set dst (VectorTest src1 src2));
7754   effect(KILL cr);
7755   format %{ "vptest_alltrue_ge8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
7756   ins_encode %{
7757     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7758     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7759     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7760     uint masklen = Matcher::vector_length(this, $src1);
7761     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, knoreg);
7762   %}
7763   ins_pipe( pipe_slow );
7764 %}
7765 
7766 
7767 instruct vptest_anytrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp, rFlagsReg cr) %{
7768   predicate(!VM_Version::supports_avx512bwdq() &&
7769             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7770             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7771             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7772   match(Set dst (VectorTest src1 src2 ));
7773   effect(TEMP vtmp, KILL cr);
7774   format %{ "vptest_anytrue_lt16 $dst,$src1,$src2\t! using $vtmp, $cr as TEMP" %}
7775   ins_encode %{
7776     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7777     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7778     __ setb(Assembler::notZero, $dst$$Register);
7779     __ movzbl($dst$$Register, $dst$$Register);
7780   %}
7781   ins_pipe( pipe_slow );
7782 %}
7783 
7784 instruct vptest_anytrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7785   predicate(!VM_Version::supports_avx512bwdq() &&
7786             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7787             Matcher::vector_length_in_bytes(n->in(1)) < 64  &&
7788             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7789   match(Set dst (VectorTest src1 src2 ));
7790   effect(KILL cr);
7791   format %{ "vptest_anytrue_ge16 $dst,$src1,$src2\t! using $cr as TEMP" %}
7792   ins_encode %{
7793     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7794     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7795     __ setb(Assembler::notZero, $dst$$Register);
7796     __ movzbl($dst$$Register, $dst$$Register);
7797   %}
7798   ins_pipe( pipe_slow );
7799 %}
7800 
7801 instruct vptest_anytrue_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
7802   predicate(VM_Version::supports_avx512bwdq() &&
7803             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7804   match(Set dst (VectorTest src1 src2));
7805   effect(KILL cr);
7806   format %{ "vptest_anytrue_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
7807   ins_encode %{
7808     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7809     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7810     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7811     uint  masklen = Matcher::vector_length(this, $src1);
7812     __ anytrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister);
7813   %}
7814   ins_pipe( pipe_slow );
7815 %}
7816 
7817 instruct cmpvptest_anytrue_lt16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, legVec vtmp) %{
7818   predicate(!VM_Version::supports_avx512bwdq() &&
7819             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 4 &&
7820             Matcher::vector_length_in_bytes(n->in(1)->in(1)) < 16 &&
7821             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7822   match(Set cr (CmpI (VectorTest src1 src2) zero));
7823   effect(TEMP vtmp);
7824   format %{ "cmpvptest_anytrue_lt16 $src1,$src2\t! using $vtmp as TEMP" %}
7825   ins_encode %{
7826     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7827     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7828   %}
7829   ins_pipe( pipe_slow );
7830 %}
7831 
7832 instruct cmpvptest_anytrue_ge16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero) %{
7833   predicate(!VM_Version::supports_avx512bwdq() &&
7834             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 16 &&
7835             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <  64 &&
7836             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7837   match(Set cr (CmpI (VectorTest src1 src2) zero));
7838   format %{ "cmpvptest_anytrue_ge16 $src1,$src2\t!" %}
7839   ins_encode %{
7840     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7841     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7842   %}
7843   ins_pipe( pipe_slow );
7844 %}
7845 
7846 instruct cmpvptest_anytrue_evex(rFlagsReg cr, kReg src1, kReg src2, immI_0 zero) %{
7847   predicate(VM_Version::supports_avx512bwdq() &&
7848             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7849   match(Set cr (CmpI (VectorTest src1 src2) zero));
7850   format %{ "cmpvptest_anytrue_evex $src1,$src2\t!" %}
7851   ins_encode %{
7852     uint masklen = Matcher::vector_length(this, $src1);
7853     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7854     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7855     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
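         // The narrowest k-register test operates on 8 mask bits, so round short
         // masks up to 8; the unused upper mask bits are zero.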
7856     masklen = masklen < 8 ? 8 : masklen;
7857     __ ktest(masklen, $src1$$KRegister, $src2$$KRegister);
7858   %}
7859   ins_pipe( pipe_slow );
7860 %}
7861 #endif
7862 
7863 //------------------------------------- LoadMask --------------------------------------------
7864 
7865 instruct loadMask(legVec dst, legVec src) %{
7866   predicate(n->bottom_type()->isa_vectmask() == NULL && !VM_Version::supports_avx512vlbw());
7867   match(Set dst (VectorLoadMask src));
7868   effect(TEMP dst);
7869   format %{ "vector_loadmask_byte $dst, $src" %}
7870   ins_encode %{
7871     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7872     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7873     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
7874   %}
7875   ins_pipe( pipe_slow );
7876 %}
7877 
7878 instruct loadMask64(kReg dst, vec src, vec xtmp, rRegI tmp) %{
7879   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
7880   match(Set dst (VectorLoadMask src));
7881   effect(TEMP xtmp, TEMP tmp);
7882   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp and $tmp as TEMP" %}
7883   ins_encode %{
7884     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
7885                         $tmp$$Register, true, Assembler::AVX_512bit);
7886   %}
7887   ins_pipe( pipe_slow );
7888 %}
7889 
7890 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
7891   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
7892   match(Set dst (VectorLoadMask src));
7893   effect(TEMP xtmp);
7894   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
7895   ins_encode %{
7896     int vlen_enc = vector_length_encoding(in(1));
7897     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
7898                         noreg, false, vlen_enc);
7899   %}
7900   ins_pipe( pipe_slow );
7901 %}
7902 
7903 //------------------------------------- StoreMask --------------------------------------------
7904 
7905 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
7906   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
7907   match(Set dst (VectorStoreMask src size));
7908   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
7909   ins_encode %{
7910     int vlen = Matcher::vector_length(this);
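         // The mask lanes arrive as 0 or -1 (0xFF); (v)pabsb maps -1 to 1,
         // producing the 0/1 boolean bytes expected by VectorStoreMask.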
7911     if (vlen <= 16 && UseAVX <= 2) {
7912       assert(UseSSE >= 3, "required");
7913       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7914     } else {
7915       assert(UseAVX > 0, "required");
7916       int src_vlen_enc = vector_length_encoding(this, $src);
7917       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7918     }
7919   %}
7920   ins_pipe( pipe_slow );
7921 %}
7922 
7923 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
7924   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
7925   match(Set dst (VectorStoreMask src size));
7926   effect(TEMP_DEF dst, TEMP xtmp);
7927   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
7928   ins_encode %{
7929     int vlen_enc = Assembler::AVX_128bit;
7930     int vlen = Matcher::vector_length(this);
7931     if (vlen <= 8) {
7932       assert(UseSSE >= 3, "required");
7933       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
7934       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7935       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
7936     } else {
7937       assert(UseAVX > 0, "required");
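           // Pack the 16 short lanes (two 128-bit halves) into 16 bytes with
           // signed saturation, then normalize the 0/-1 bytes to 0/1 via vpabsb.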
7938       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
7939       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7940       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7941     }
7942   %}
7943   ins_pipe( pipe_slow );
7944 %}
7945 
7946 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
7947   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
7948   match(Set dst (VectorStoreMask src size));
7949   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
7950   effect(TEMP_DEF dst, TEMP xtmp);
7951   ins_encode %{
7952     int vlen_enc = Assembler::AVX_128bit;
7953     int vlen = Matcher::vector_length(this);
7954     if (vlen <= 4) {
7955       assert(UseSSE >= 3, "required");
7956       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
7957       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7958       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
7959       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
7960     } else {
7961       assert(UseAVX > 0, "required");
7962       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
7963       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
7964       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7965       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
7966       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7967     }
7968   %}
7969   ins_pipe( pipe_slow );
7970 %}
7971 
7972 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
7973   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
7974   match(Set dst (VectorStoreMask src size));
7975   effect(TEMP_DEF dst, TEMP xtmp);
7976   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
7977   ins_encode %{
7978     assert(UseSSE >= 3, "required");
7979     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
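         // pshufd with selector 0x8 gathers dwords 0 and 2 (the low halves of the
         // two long lanes) into the low 64 bits before normalizing and packing to bytes.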
7980     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
7981     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
7982     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
7983     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
7984   %}
7985   ins_pipe( pipe_slow );
7986 %}
7987 
7988 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
7989   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
7990   match(Set dst (VectorStoreMask src size));
7991   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
7992   effect(TEMP_DEF dst, TEMP vtmp);
7993   ins_encode %{
7994     int vlen_enc = Assembler::AVX_128bit;
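         // vpshufps 0x88 keeps the even (low-half) dword of every long lane; the
         // vextracti128/vblendps pair then gathers all four into the low 128 bits.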
7995     __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
7996     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
7997     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
7998     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
7999     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8000     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8001     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8002   %}
8003   ins_pipe( pipe_slow );
8004 %}
8005 
8006 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
8007   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8008   match(Set dst (VectorStoreMask src size));
8009   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8010   ins_encode %{
8011     int src_vlen_enc = vector_length_encoding(this, $src);
8012     int dst_vlen_enc = vector_length_encoding(this);
8013     if (!VM_Version::supports_avx512vl()) {
8014       src_vlen_enc = Assembler::AVX_512bit;
8015     }
8016     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8017     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8018   %}
8019   ins_pipe( pipe_slow );
8020 %}
8021 
8022 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
8023   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8024   match(Set dst (VectorStoreMask src size));
8025   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8026   ins_encode %{
8027     int src_vlen_enc = vector_length_encoding(this, $src);
8028     int dst_vlen_enc = vector_length_encoding(this);
8029     if (!VM_Version::supports_avx512vl()) {
8030       src_vlen_enc = Assembler::AVX_512bit;
8031     }
8032     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8033     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8034   %}
8035   ins_pipe( pipe_slow );
8036 %}
8037 
8038 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size, rRegI tmp) %{
8039   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
8040   match(Set dst (VectorStoreMask mask size));
8041   effect(TEMP_DEF dst, TEMP tmp);
8042   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
8043   ins_encode %{
8044     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
8045     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
8046                  false, Assembler::AVX_512bit, $tmp$$Register);
8047     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
8048   %}
8049   ins_pipe( pipe_slow );
8050 %}
8051 
8052 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
8053   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
8054   match(Set dst (VectorStoreMask mask size));
8055   effect(TEMP_DEF dst);
8056   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
8057   ins_encode %{
8058     int dst_vlen_enc = vector_length_encoding(this);
8059     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
8060     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8061   %}
8062   ins_pipe( pipe_slow );
8063 %}
8064 
8065 instruct vmaskcast_evex(kReg dst) %{
8066   predicate(Matcher::vector_length(n) == Matcher::vector_length(n->in(1)));
8067   match(Set dst (VectorMaskCast dst));
8068   ins_cost(0);
8069   format %{ "vector_mask_cast $dst" %}
8070   ins_encode %{
8071     // empty
8072   %}
8073   ins_pipe(empty);
8074 %}
8075 
8076 instruct vmaskcast(vec dst) %{
8077   predicate((Matcher::vector_length(n) == Matcher::vector_length(n->in(1))) &&
8078             (Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))));
8079   match(Set dst (VectorMaskCast dst));
8080   ins_cost(0);
8081   format %{ "vector_mask_cast $dst" %}
8082   ins_encode %{
8083     // empty
8084   %}
8085   ins_pipe(empty);
8086 %}
8087 
8088 //-------------------------------- Load Iota Indices ----------------------------------
8089 
8090 instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
8091   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
8092   match(Set dst (VectorLoadConst src));
8093   effect(TEMP scratch);
8094   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
8095   ins_encode %{
8096      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8097      __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen_in_bytes);
8098   %}
8099   ins_pipe( pipe_slow );
8100 %}
8101 
8102 //-------------------------------- Rearrange ----------------------------------
8103 
8104 // LoadShuffle/Rearrange for Byte
8105 
8106 instruct loadShuffleB(vec dst) %{
8107   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
8108   match(Set dst (VectorLoadShuffle dst));
8109   format %{ "vector_load_shuffle $dst, $dst" %}
8110   ins_encode %{
8111     // empty
8112   %}
8113   ins_pipe( pipe_slow );
8114 %}
8115 
8116 instruct rearrangeB(vec dst, vec shuffle) %{
8117   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8118             Matcher::vector_length(n) < 32);
8119   match(Set dst (VectorRearrange dst shuffle));
8120   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8121   ins_encode %{
8122     assert(UseSSE >= 4, "required");
8123     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8124   %}
8125   ins_pipe( pipe_slow );
8126 %}
8127 
8128 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
8129   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8130             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
8131   match(Set dst (VectorRearrange src shuffle));
8132   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
8133   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
8134   ins_encode %{
8135     assert(UseAVX >= 2, "required");
8136     // Swap src into vtmp1
8137     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
8138     // Shuffle swapped src to get entries from other 128 bit lane
8139     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8140     // Shuffle original src to get entries from its own 128 bit lane
8141     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8142     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
8143     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
8144     // Perform the blend
8145     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
8146   %}
8147   ins_pipe( pipe_slow );
8148 %}
8149 
8150 instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
8151   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8152             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
8153   match(Set dst (VectorRearrange src shuffle));
8154   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8155   ins_encode %{
8156     int vlen_enc = vector_length_encoding(this);
8157     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8158   %}
8159   ins_pipe( pipe_slow );
8160 %}
8161 
8162 // LoadShuffle/Rearrange for Short
8163 
8164 instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
8165   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8166             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
8167   match(Set dst (VectorLoadShuffle src));
8168   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8169   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8170   ins_encode %{
8171     // Create a byte shuffle mask from the short shuffle mask;
8172     // only a byte shuffle instruction is available on these platforms.
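         // In effect a short shuffle index k becomes the byte-index pair {2k, 2k+1},
         // e.g. index 3 -> {6, 7}, selecting both bytes of short lane 3.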
8173     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8174     if (UseAVX == 0) {
8175       assert(vlen_in_bytes <= 16, "required");
8176       // Multiply each shuffle by two to get byte index
8177       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
8178       __ psllw($vtmp$$XMMRegister, 1);
8179 
8180       // Duplicate to create 2 copies of byte index
8181       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
8182       __ psllw($dst$$XMMRegister, 8);
8183       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
8184 
8185       // Add one to get alternate byte index
8186       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
8187       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
8188     } else {
8189       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
8190       int vlen_enc = vector_length_encoding(this);
8191       // Multiply each shuffle by two to get byte index
8192       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
8193       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
8194 
8195       // Duplicate to create 2 copies of byte index
8196       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
8197       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8198 
8199       // Add one to get alternate byte index
8200       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, $scratch$$Register);
8201     }
8202   %}
8203   ins_pipe( pipe_slow );
8204 %}
8205 
8206 instruct rearrangeS(vec dst, vec shuffle) %{
8207   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8208             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
8209   match(Set dst (VectorRearrange dst shuffle));
8210   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8211   ins_encode %{
8212     assert(UseSSE >= 4, "required");
8213     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8214   %}
8215   ins_pipe( pipe_slow );
8216 %}
8217 
8218 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
8219   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8220             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
8221   match(Set dst (VectorRearrange src shuffle));
8222   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
8223   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
8224   ins_encode %{
8225     assert(UseAVX >= 2, "required");
8226     // Swap src into vtmp1
8227     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
8228     // Shuffle swapped src to get entries from other 128 bit lane
8229     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8230     // Shuffle original src to get entries from its own 128 bit lane
8231     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8232     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
8233     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
8234     // Perform the blend
8235     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
8236   %}
8237   ins_pipe( pipe_slow );
8238 %}
8239 
8240 instruct loadShuffleS_evex(vec dst, vec src) %{
8241   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8242             VM_Version::supports_avx512bw());
8243   match(Set dst (VectorLoadShuffle src));
8244   format %{ "vector_load_shuffle $dst, $src" %}
8245   ins_encode %{
8246     int vlen_enc = vector_length_encoding(this);
8247     if (!VM_Version::supports_avx512vl()) {
8248       vlen_enc = Assembler::AVX_512bit;
8249     }
8250     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8251   %}
8252   ins_pipe( pipe_slow );
8253 %}
8254 
8255 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
8256   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8257             VM_Version::supports_avx512bw());
8258   match(Set dst (VectorRearrange src shuffle));
8259   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8260   ins_encode %{
8261     int vlen_enc = vector_length_encoding(this);
8262     if (!VM_Version::supports_avx512vl()) {
8263       vlen_enc = Assembler::AVX_512bit;
8264     }
8265     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8266   %}
8267   ins_pipe( pipe_slow );
8268 %}
8269 
8270 // LoadShuffle/Rearrange for Integer and Float
8271 
8272 instruct loadShuffleI(vec dst, vec src, vec vtmp, rRegP scratch) %{
8273   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8274             Matcher::vector_length(n) == 4 && UseAVX < 2);
8275   match(Set dst (VectorLoadShuffle src));
8276   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8277   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8278   ins_encode %{
8279     assert(UseSSE >= 4, "required");
8280 
8281     // Create a byte shuffle mask from the int shuffle mask;
8282     // only a byte shuffle instruction is available on these platforms.
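         // In effect an int shuffle index k becomes the byte quad {4k .. 4k+3},
         // e.g. index 2 -> {8, 9, 10, 11}, selecting all four bytes of int lane 2.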
8283 
8284     // Duplicate and multiply each shuffle by 4
8285     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
8286     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
8287     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
8288     __ psllw($vtmp$$XMMRegister, 2);
8289 
8290     // Duplicate again to create 4 copies of byte index
8291     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
8292     __ psllw($dst$$XMMRegister, 8);
8293     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
8294 
8295     // Add 3,2,1,0 to get alternate byte index
8296     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
8297     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
8298   %}
8299   ins_pipe( pipe_slow );
8300 %}
8301 
8302 instruct rearrangeI(vec dst, vec shuffle) %{
8303   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8304             Matcher::vector_length(n) == 4 && UseAVX < 2);
8305   match(Set dst (VectorRearrange dst shuffle));
8306   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8307   ins_encode %{
8308     assert(UseSSE >= 4, "required");
8309     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8310   %}
8311   ins_pipe( pipe_slow );
8312 %}
8313 
8314 instruct loadShuffleI_avx(vec dst, vec src) %{
8315   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8316             UseAVX >= 2);
8317   match(Set dst (VectorLoadShuffle src));
8318   format %{ "vector_load_shuffle $dst, $src" %}
8319   ins_encode %{
8320     int vlen_enc = vector_length_encoding(this);
8321     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8322   %}
8323   ins_pipe( pipe_slow );
8324 %}
8325 
8326 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
8327   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8328             UseAVX >= 2);
8329   match(Set dst (VectorRearrange src shuffle));
8330   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8331   ins_encode %{
8332     int vlen_enc = vector_length_encoding(this);
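         // vpermd has no 128-bit form, so widen the encoding to 256 bits; only
         // the low part of the result is consumed for the smaller vectors.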
8333     if (vlen_enc == Assembler::AVX_128bit) {
8334       vlen_enc = Assembler::AVX_256bit;
8335     }
8336     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8337   %}
8338   ins_pipe( pipe_slow );
8339 %}
8340 
8341 // LoadShuffle/Rearrange for Long and Double
8342 
8343 instruct loadShuffleL(vec dst, vec src, vec vtmp, rRegP scratch) %{
8344   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8345             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
8346   match(Set dst (VectorLoadShuffle src));
8347   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8348   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8349   ins_encode %{
8350     assert(UseAVX >= 2, "required");
8351 
8352     int vlen_enc = vector_length_encoding(this);
8353     // Create a double word shuffle mask from the long shuffle mask;
8354     // only a double word shuffle instruction is available on these platforms.
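         // In effect a long shuffle index k becomes the dword pair {2k, 2k+1},
         // e.g. index 1 -> {2, 3}, selecting both dwords of long lane 1 for vpermd.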
8355 
8356     // Multiply each shuffle by two to get double word index
8357     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
8358     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
8359 
8360     // Duplicate each double word shuffle
8361     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
8362     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8363 
8364     // Add one to get alternate double word index
8365     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, $scratch$$Register);
8366   %}
8367   ins_pipe( pipe_slow );
8368 %}
8369 
8370 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
8371   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8372             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
8373   match(Set dst (VectorRearrange src shuffle));
8374   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8375   ins_encode %{
8376     assert(UseAVX >= 2, "required");
8377 
8378     int vlen_enc = vector_length_encoding(this);
8379     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8380   %}
8381   ins_pipe( pipe_slow );
8382 %}
8383 
8384 instruct loadShuffleL_evex(vec dst, vec src) %{
8385   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8386             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
8387   match(Set dst (VectorLoadShuffle src));
8388   format %{ "vector_load_shuffle $dst, $src" %}
8389   ins_encode %{
8390     assert(UseAVX > 2, "required");
8391 
8392     int vlen_enc = vector_length_encoding(this);
8393     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8394   %}
8395   ins_pipe( pipe_slow );
8396 %}
8397 
8398 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
8399   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8400             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
8401   match(Set dst (VectorRearrange src shuffle));
8402   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8403   ins_encode %{
8404     assert(UseAVX > 2, "required");
8405 
8406     int vlen_enc = vector_length_encoding(this);
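         // Variable vpermq has no 128-bit form, so widen the encoding to 256 bits;
         // only the low lanes of the result are consumed for the shorter vectors.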
8407     if (vlen_enc == Assembler::AVX_128bit) {
8408       vlen_enc = Assembler::AVX_256bit;
8409     }
8410     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8411   %}
8412   ins_pipe( pipe_slow );
8413 %}
8414 
8415 // --------------------------------- FMA --------------------------------------
8416 // a * b + c
8417 
8418 instruct vfmaF_reg(vec a, vec b, vec c) %{
8419   match(Set c (FmaVF  c (Binary a b)));
8420   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8421   ins_cost(150);
8422   ins_encode %{
8423     assert(UseFMA, "not enabled");
8424     int vlen_enc = vector_length_encoding(this);
8425     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8426   %}
8427   ins_pipe( pipe_slow );
8428 %}
8429 
8430 instruct vfmaF_mem(vec a, memory b, vec c) %{
8431   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8432   match(Set c (FmaVF  c (Binary a (LoadVector b))));
8433   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8434   ins_cost(150);
8435   ins_encode %{
8436     assert(UseFMA, "not enabled");
8437     int vlen_enc = vector_length_encoding(this);
8438     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8439   %}
8440   ins_pipe( pipe_slow );
8441 %}
8442 
8443 instruct vfmaD_reg(vec a, vec b, vec c) %{
8444   match(Set c (FmaVD  c (Binary a b)));
8445   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8446   ins_cost(150);
8447   ins_encode %{
8448     assert(UseFMA, "not enabled");
8449     int vlen_enc = vector_length_encoding(this);
8450     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8451   %}
8452   ins_pipe( pipe_slow );
8453 %}
8454 
8455 instruct vfmaD_mem(vec a, memory b, vec c) %{
8456   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8457   match(Set c (FmaVD  c (Binary a (LoadVector b))));
8458   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8459   ins_cost(150);
8460   ins_encode %{
8461     assert(UseFMA, "not enabled");
8462     int vlen_enc = vector_length_encoding(this);
8463     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8464   %}
8465   ins_pipe( pipe_slow );
8466 %}
8467 
8468 // --------------------------------- Vector Multiply Add --------------------------------------
8469 
8470 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
8471   predicate(UseAVX == 0);
8472   match(Set dst (MulAddVS2VI dst src1));
8473   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
8474   ins_encode %{
8475     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
8476   %}
8477   ins_pipe( pipe_slow );
8478 %}
8479 
8480 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
8481   predicate(UseAVX > 0);
8482   match(Set dst (MulAddVS2VI src1 src2));
8483   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
8484   ins_encode %{
8485     int vlen_enc = vector_length_encoding(this);
8486     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8487   %}
8488   ins_pipe( pipe_slow );
8489 %}
8490 
8491 // --------------------------------- Vector Multiply Add Add ----------------------------------
8492 
8493 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
8494   predicate(VM_Version::supports_avx512_vnni());
8495   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
8496   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
8497   ins_encode %{
8498     assert(UseAVX > 2, "required");
8499     int vlen_enc = vector_length_encoding(this);
8500     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8501   %}
8502   ins_pipe( pipe_slow );
8503   ins_cost(10);
8504 %}
8505 
8506 // --------------------------------- PopCount --------------------------------------
8507 
8508 instruct vpopcountI(vec dst, vec src) %{
8509   match(Set dst (PopCountVI src));
8510   format %{ "vpopcntd  $dst,$src\t! vector popcount packedI" %}
8511   ins_encode %{
8512     assert(UsePopCountInstruction, "not enabled");
8513 
8514     int vlen_enc = vector_length_encoding(this);
8515     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8516   %}
8517   ins_pipe( pipe_slow );
8518 %}
8519 
8520 // --------------------------------- Bitwise Ternary Logic ----------------------------------
8521 
8522 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
8523   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
8524   effect(TEMP dst);
8525   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8526   ins_encode %{
8527     int vector_len = vector_length_encoding(this);
8528     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
8529   %}
8530   ins_pipe( pipe_slow );
8531 %}
8532 
8533 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
8534   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
8535   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
8536   effect(TEMP dst);
8537   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8538   ins_encode %{
8539     int vector_len = vector_length_encoding(this);
8540     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
8541   %}
8542   ins_pipe( pipe_slow );
8543 %}
8544 
8545 // --------------------------------- Rotation Operations ----------------------------------
8546 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
8547   match(Set dst (RotateLeftV src shift));
8548   match(Set dst (RotateRightV src shift));
8549   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
8550   ins_encode %{
8551     int opcode      = this->ideal_Opcode();
8552     int vector_len  = vector_length_encoding(this);
8553     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8554     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
8555   %}
8556   ins_pipe( pipe_slow );
8557 %}
8558 
8559 instruct vprorate(vec dst, vec src, vec shift) %{
8560   match(Set dst (RotateLeftV src shift));
8561   match(Set dst (RotateRightV src shift));
8562   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
8563   ins_encode %{
8564     int opcode      = this->ideal_Opcode();
8565     int vector_len  = vector_length_encoding(this);
8566     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8567     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8568   %}
8569   ins_pipe( pipe_slow );
8570 %}
8571 
8572 #ifdef _LP64
8573 // ---------------------------------- Masked Operations ------------------------------------
8574 
8575 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
8576   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
8577   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
8578   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
8579   ins_encode %{
8580     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
8581     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
8582 
8583     Label DONE;
8584     int vlen_enc = vector_length_encoding(this, $src1);
8585     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
8586 
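         // ktmp1 = lanes that are governed by $mask and compare equal. If
         // (ktmp1 | ~mask) is all ones, every governed lane matched and $dst keeps -1;
         // otherwise $dst gets the index of the first clear bit in ktmp1 (tzcnt of
         // ~ktmp1), i.e. the first mismatch for a prefix mask.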
8587     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
8588     __ mov64($dst$$Register, -1L);
8589     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
8590     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
8591     __ jccb(Assembler::carrySet, DONE);
8592     __ kmovql($dst$$Register, $ktmp1$$KRegister);
8593     __ notq($dst$$Register);
8594     __ tzcntq($dst$$Register, $dst$$Register);
8595     __ bind(DONE);
8596   %}
8597   ins_pipe( pipe_slow );
8598 %}
8599 
8600 
8601 instruct vmasked_load64(vec dst, memory mem, kReg mask) %{
8602   match(Set dst (LoadVectorMasked mem mask));
8603   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
8604   ins_encode %{
8605     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
8606     int vector_len = vector_length_encoding(this);
8607     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
8608   %}
8609   ins_pipe( pipe_slow );
8610 %}
8611 
8612 instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
8613   match(Set dst (VectorMaskGen len));
8614   effect(TEMP temp);
8615   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
8616   ins_encode %{
8617     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
8618   %}
8619   ins_pipe( pipe_slow );
8620 %}
8621 
8622 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
8623   match(Set dst (VectorMaskGen len));
8624   format %{ "vector_mask_gen $len \t! vector mask generator" %}
8625   effect(TEMP temp);
8626   ins_encode %{
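         // Materialize a mask with the lowest $len bits set, e.g. a constant
         // len of 5 yields 0xFFFFFFFFFFFFFFFF >> 59 == 0x1F.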
8627     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
8628     __ kmovql($dst$$KRegister, $temp$$Register);
8629   %}
8630   ins_pipe( pipe_slow );
8631 %}
8632 
8633 instruct vmasked_store64(memory mem, vec src, kReg mask) %{
8634   match(Set mem (StoreVectorMasked mem (Binary src mask)));
8635   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
8636   ins_encode %{
8637     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
8638     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
8639     int vector_len = vector_length_encoding(src_node);
8640     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, vector_len);
8641   %}
8642   ins_pipe( pipe_slow );
8643 %}
8644 
8645 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
8646   predicate(n->in(1)->bottom_type()->isa_vectmask());
8647   match(Set dst (VectorMaskToLong mask));
8648   effect(TEMP dst, KILL cr);
8649   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
8650   ins_encode %{
8651     int opcode = this->ideal_Opcode();
8652     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8653     int mask_len = Matcher::vector_length(this, $mask);
8654     int mask_size = mask_len * type2aelembytes(mbt);
8655     int vlen_enc = vector_length_encoding(this, $mask);
8656     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
8657                              $dst$$Register, mask_len, mask_size, vlen_enc);
8658   %}
8659   ins_pipe( pipe_slow );
8660 %}
8661 
8662 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
8663   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8664   match(Set dst (VectorMaskToLong mask));
8665   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
8666   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
8667   ins_encode %{
8668     int opcode = this->ideal_Opcode();
8669     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8670     int mask_len = Matcher::vector_length(this, $mask);
8671     int vlen_enc = vector_length_encoding(this, $mask);
8672     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8673                              $dst$$Register, mask_len, mbt, vlen_enc);
8674   %}
8675   ins_pipe( pipe_slow );
8676 %}
8677 
8678 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
8679   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
8680   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
8681   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
8682   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
8683   ins_encode %{
8684     int opcode = this->ideal_Opcode();
8685     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8686     int mask_len = Matcher::vector_length(this, $mask);
8687     int vlen_enc = vector_length_encoding(this, $mask);
8688     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8689                              $dst$$Register, mask_len, mbt, vlen_enc);
8690   %}
8691   ins_pipe( pipe_slow );
8692 %}
8693 
8694 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
8695   predicate(n->in(1)->bottom_type()->isa_vectmask());
8696   match(Set dst (VectorMaskTrueCount mask));
8697   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
8698   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
8699   ins_encode %{
8700     int opcode = this->ideal_Opcode();
8701     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8702     int mask_len = Matcher::vector_length(this, $mask);
8703     int mask_size = mask_len * type2aelembytes(mbt);
8704     int vlen_enc = vector_length_encoding(this, $mask);
8705     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
8706                              $tmp$$Register, mask_len, mask_size, vlen_enc);
8707   %}
8708   ins_pipe( pipe_slow );
8709 %}
8710 
8711 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8712   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8713   match(Set dst (VectorMaskTrueCount mask));
8714   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8715   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8716   ins_encode %{
8717     int opcode = this->ideal_Opcode();
8718     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8719     int mask_len = Matcher::vector_length(this, $mask);
8720     int vlen_enc = vector_length_encoding(this, $mask);
8721     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8722                              $tmp$$Register, mask_len, mbt, vlen_enc);
8723   %}
8724   ins_pipe( pipe_slow );
8725 %}
8726 
8727 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8728   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
8729   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
8730   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8731   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8732   ins_encode %{
8733     int opcode = this->ideal_Opcode();
8734     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8735     int mask_len = Matcher::vector_length(this, $mask);
8736     int vlen_enc = vector_length_encoding(this, $mask);
8737     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8738                              $tmp$$Register, mask_len, mbt, vlen_enc);
8739   %}
8740   ins_pipe( pipe_slow );
8741 %}
8742 
8743 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
8744   predicate(n->in(1)->bottom_type()->isa_vectmask());
8745   match(Set dst (VectorMaskFirstTrue mask));
8746   match(Set dst (VectorMaskLastTrue mask));
8747   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
8748   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
8749   ins_encode %{
8750     int opcode = this->ideal_Opcode();
8751     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8752     int mask_len = Matcher::vector_length(this, $mask);
8753     int mask_size = mask_len * type2aelembytes(mbt);
8754     int vlen_enc = vector_length_encoding(this, $mask);
8755     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
8756                              $tmp$$Register, mask_len, mask_size, vlen_enc);
8757   %}
8758   ins_pipe( pipe_slow );
8759 %}
8760 
8761 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8762   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8763   match(Set dst (VectorMaskFirstTrue mask));
8764   match(Set dst (VectorMaskLastTrue mask));
8765   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8766   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8767   ins_encode %{
8768     int opcode = this->ideal_Opcode();
8769     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8770     int mask_len = Matcher::vector_length(this, $mask);
8771     int vlen_enc = vector_length_encoding(this, $mask);
8772     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8773                              $tmp$$Register, mask_len, mbt, vlen_enc);
8774   %}
8775   ins_pipe( pipe_slow );
8776 %}
8777 
8778 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8779   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
8780   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
8781   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
8782   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8783   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8784   ins_encode %{
8785     int opcode = this->ideal_Opcode();
8786     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8787     int mask_len = Matcher::vector_length(this, $mask);
8788     int vlen_enc = vector_length_encoding(this, $mask);
8789     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8790                              $tmp$$Register, mask_len, mbt, vlen_enc);
8791   %}
8792   ins_pipe( pipe_slow );
8793 %}
8794 #endif // _LP64
8795 
8796 // ---------------------------------- Vector Masked Operations ------------------------------------
8797 
8798 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
8799   match(Set dst (AddVB (Binary dst src2) mask));
8800   match(Set dst (AddVS (Binary dst src2) mask));
8801   match(Set dst (AddVI (Binary dst src2) mask));
8802   match(Set dst (AddVL (Binary dst src2) mask));
8803   match(Set dst (AddVF (Binary dst src2) mask));
8804   match(Set dst (AddVD (Binary dst src2) mask));
8805   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
8806   ins_encode %{
8807     int vlen_enc = vector_length_encoding(this);
8808     BasicType bt = Matcher::vector_element_basic_type(this);
8809     int opc = this->ideal_Opcode();
8810     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8811                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8812   %}
8813   ins_pipe( pipe_slow );
8814 %}
8815 
8816 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
8817   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
8818   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
8819   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
8820   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
8821   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
8822   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
8823   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
8824   ins_encode %{
8825     int vlen_enc = vector_length_encoding(this);
8826     BasicType bt = Matcher::vector_element_basic_type(this);
8827     int opc = this->ideal_Opcode();
8828     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8829                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8830   %}
8831   ins_pipe( pipe_slow );
8832 %}
8833 
8834 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
8835   match(Set dst (XorV (Binary dst src2) mask));
8836   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
8837   ins_encode %{
8838     int vlen_enc = vector_length_encoding(this);
8839     BasicType bt = Matcher::vector_element_basic_type(this);
8840     int opc = this->ideal_Opcode();
8841     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8842                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8843   %}
8844   ins_pipe( pipe_slow );
8845 %}
8846 
8847 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
8848   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
8849   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
8850   ins_encode %{
8851     int vlen_enc = vector_length_encoding(this);
8852     BasicType bt = Matcher::vector_element_basic_type(this);
8853     int opc = this->ideal_Opcode();
8854     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8855                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8856   %}
8857   ins_pipe( pipe_slow );
8858 %}
8859 
8860 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
8861   match(Set dst (OrV (Binary dst src2) mask));
8862   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
8863   ins_encode %{
8864     int vlen_enc = vector_length_encoding(this);
8865     BasicType bt = Matcher::vector_element_basic_type(this);
8866     int opc = this->ideal_Opcode();
8867     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8868                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8869   %}
8870   ins_pipe( pipe_slow );
8871 %}
8872 
8873 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
8874   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
8875   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
8876   ins_encode %{
8877     int vlen_enc = vector_length_encoding(this);
8878     BasicType bt = Matcher::vector_element_basic_type(this);
8879     int opc = this->ideal_Opcode();
8880     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8881                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8882   %}
8883   ins_pipe( pipe_slow );
8884 %}
8885 
8886 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
8887   match(Set dst (AndV (Binary dst src2) mask));
8888   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
8889   ins_encode %{
8890     int vlen_enc = vector_length_encoding(this);
8891     BasicType bt = Matcher::vector_element_basic_type(this);
8892     int opc = this->ideal_Opcode();
8893     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8894                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8895   %}
8896   ins_pipe( pipe_slow );
8897 %}
8898 
8899 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
8900   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
8901   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
8902   ins_encode %{
8903     int vlen_enc = vector_length_encoding(this);
8904     BasicType bt = Matcher::vector_element_basic_type(this);
8905     int opc = this->ideal_Opcode();
8906     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8907                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8908   %}
8909   ins_pipe( pipe_slow );
8910 %}
8911 
8912 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
8913   match(Set dst (SubVB (Binary dst src2) mask));
8914   match(Set dst (SubVS (Binary dst src2) mask));
8915   match(Set dst (SubVI (Binary dst src2) mask));
8916   match(Set dst (SubVL (Binary dst src2) mask));
8917   match(Set dst (SubVF (Binary dst src2) mask));
8918   match(Set dst (SubVD (Binary dst src2) mask));
8919   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
8920   ins_encode %{
8921     int vlen_enc = vector_length_encoding(this);
8922     BasicType bt = Matcher::vector_element_basic_type(this);
8923     int opc = this->ideal_Opcode();
8924     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8925                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8926   %}
8927   ins_pipe( pipe_slow );
8928 %}
8929 
8930 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
8931   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
8932   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
8933   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
8934   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
8935   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
8936   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
8937   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
8938   ins_encode %{
8939     int vlen_enc = vector_length_encoding(this);
8940     BasicType bt = Matcher::vector_element_basic_type(this);
8941     int opc = this->ideal_Opcode();
8942     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8943                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8944   %}
8945   ins_pipe( pipe_slow );
8946 %}
8947 
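// Masked multiply. Note that there is no MulVB pattern here: x86 provides no
// packed byte multiply instruction, so masked byte multiplication cannot be
// expressed as a single EVEX operation.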
8948 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
8949   match(Set dst (MulVS (Binary dst src2) mask));
8950   match(Set dst (MulVI (Binary dst src2) mask));
8951   match(Set dst (MulVL (Binary dst src2) mask));
8952   match(Set dst (MulVF (Binary dst src2) mask));
8953   match(Set dst (MulVD (Binary dst src2) mask));
8954   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
8955   ins_encode %{
8956     int vlen_enc = vector_length_encoding(this);
8957     BasicType bt = Matcher::vector_element_basic_type(this);
8958     int opc = this->ideal_Opcode();
8959     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8960                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8961   %}
8962   ins_pipe( pipe_slow );
8963 %}
8964 
8965 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
8966   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
8967   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
8968   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
8969   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
8970   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
8971   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
8972   ins_encode %{
8973     int vlen_enc = vector_length_encoding(this);
8974     BasicType bt = Matcher::vector_element_basic_type(this);
8975     int opc = this->ideal_Opcode();
8976     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8977                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8978   %}
8979   ins_pipe( pipe_slow );
8980 %}
8981 
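// Masked square root and divide exist only for the floating point element
// types; x86 has no SIMD integer divide instruction, so no integral DivV
// patterns appear here.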
8982 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
8983   match(Set dst (SqrtVF dst mask));
8984   match(Set dst (SqrtVD dst mask));
8985   ins_cost(100);
8986   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
8987   ins_encode %{
8988     int vlen_enc = vector_length_encoding(this);
8989     BasicType bt = Matcher::vector_element_basic_type(this);
8990     int opc = this->ideal_Opcode();
8991     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8992                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
8993   %}
8994   ins_pipe( pipe_slow );
8995 %}
8996 
8997 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
8998   match(Set dst (DivVF (Binary dst src2) mask));
8999   match(Set dst (DivVD (Binary dst src2) mask));
9000   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
9001   ins_encode %{
9002     int vlen_enc = vector_length_encoding(this);
9003     BasicType bt = Matcher::vector_element_basic_type(this);
9004     int opc = this->ideal_Opcode();
9005     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9006                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9007   %}
9008   ins_pipe( pipe_slow );
9009 %}
9010 
9011 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
9012   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
9013   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
9014   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
9015   ins_encode %{
9016     int vlen_enc = vector_length_encoding(this);
9017     BasicType bt = Matcher::vector_element_basic_type(this);
9018     int opc = this->ideal_Opcode();
9019     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9020                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9021   %}
9022   ins_pipe( pipe_slow );
9023 %}
9024 
9025 
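// Masked rotates and shifts. Each shift kind comes in three flavours:
//   - an immediate count (the *_imm_masked forms),
//   - a uniform count held in a vector register (predicate !is_var_shift),
//   - a per-lane variable count (predicate is_var_shift).
// The extra trailing boolean passed to evmasked_op() in the register forms
// presumably distinguishes the variable-shift (vpsllv/vpsrlv/vpsrav style)
// encodings from the uniform-count ones.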
9026 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
9027   match(Set dst (RotateLeftV (Binary dst shift) mask));
9028   match(Set dst (RotateRightV (Binary dst shift) mask));
9029   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
9030   ins_encode %{
9031     int vlen_enc = vector_length_encoding(this);
9032     BasicType bt = Matcher::vector_element_basic_type(this);
9033     int opc = this->ideal_Opcode();
9034     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9035                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9036   %}
9037   ins_pipe( pipe_slow );
9038 %}
9039 
9040 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
9041   match(Set dst (RotateLeftV (Binary dst src2) mask));
9042   match(Set dst (RotateRightV (Binary dst src2) mask));
9043   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
9044   ins_encode %{
9045     int vlen_enc = vector_length_encoding(this);
9046     BasicType bt = Matcher::vector_element_basic_type(this);
9047     int opc = this->ideal_Opcode();
9048     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9049                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9050   %}
9051   ins_pipe( pipe_slow );
9052 %}
9053 
9054 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9055   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
9056   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
9057   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
9058   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
9059   ins_encode %{
9060     int vlen_enc = vector_length_encoding(this);
9061     BasicType bt = Matcher::vector_element_basic_type(this);
9062     int opc = this->ideal_Opcode();
9063     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9064                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9065   %}
9066   ins_pipe( pipe_slow );
9067 %}
9068 
9069 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
9070   predicate(!n->as_ShiftV()->is_var_shift());
9071   match(Set dst (LShiftVS (Binary dst src2) mask));
9072   match(Set dst (LShiftVI (Binary dst src2) mask));
9073   match(Set dst (LShiftVL (Binary dst src2) mask));
9074   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9075   ins_encode %{
9076     int vlen_enc = vector_length_encoding(this);
9077     BasicType bt = Matcher::vector_element_basic_type(this);
9078     int opc = this->ideal_Opcode();
9079     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9080                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9081   %}
9082   ins_pipe( pipe_slow );
9083 %}
9084 
9085 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9086   predicate(n->as_ShiftV()->is_var_shift());
9087   match(Set dst (LShiftVS (Binary dst src2) mask));
9088   match(Set dst (LShiftVI (Binary dst src2) mask));
9089   match(Set dst (LShiftVL (Binary dst src2) mask));
9090   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9091   ins_encode %{
9092     int vlen_enc = vector_length_encoding(this);
9093     BasicType bt = Matcher::vector_element_basic_type(this);
9094     int opc = this->ideal_Opcode();
9095     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9096                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9097   %}
9098   ins_pipe( pipe_slow );
9099 %}
9100 
9101 instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
9102   match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
9103   match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
9104   match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
9105   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9106   ins_encode %{
9107     int vlen_enc = vector_length_encoding(this);
9108     BasicType bt = Matcher::vector_element_basic_type(this);
9109     int opc = this->ideal_Opcode();
9110     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9111                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9112   %}
9113   ins_pipe( pipe_slow );
9114 %}
9115 
9116 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9117   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
9118   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
9119   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
9120   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
9121   ins_encode %{
9122     int vlen_enc = vector_length_encoding(this);
9123     BasicType bt = Matcher::vector_element_basic_type(this);
9124     int opc = this->ideal_Opcode();
9125     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9126                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9127   %}
9128   ins_pipe( pipe_slow );
9129 %}
9130 
9131 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
9132   predicate(!n->as_ShiftV()->is_var_shift());
9133   match(Set dst (RShiftVS (Binary dst src2) mask));
9134   match(Set dst (RShiftVI (Binary dst src2) mask));
9135   match(Set dst (RShiftVL (Binary dst src2) mask));
9136   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9137   ins_encode %{
9138     int vlen_enc = vector_length_encoding(this);
9139     BasicType bt = Matcher::vector_element_basic_type(this);
9140     int opc = this->ideal_Opcode();
9141     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9142                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9143   %}
9144   ins_pipe( pipe_slow );
9145 %}
9146 
9147 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9148   predicate(n->as_ShiftV()->is_var_shift());
9149   match(Set dst (RShiftVS (Binary dst src2) mask));
9150   match(Set dst (RShiftVI (Binary dst src2) mask));
9151   match(Set dst (RShiftVL (Binary dst src2) mask));
9152   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9153   ins_encode %{
9154     int vlen_enc = vector_length_encoding(this);
9155     BasicType bt = Matcher::vector_element_basic_type(this);
9156     int opc = this->ideal_Opcode();
9157     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9158                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9159   %}
9160   ins_pipe( pipe_slow );
9161 %}
9162 
9163 instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
9164   match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
9165   match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
9166   match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
9167   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9168   ins_encode %{
9169     int vlen_enc = vector_length_encoding(this);
9170     BasicType bt = Matcher::vector_element_basic_type(this);
9171     int opc = this->ideal_Opcode();
9172     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9173                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9174   %}
9175   ins_pipe( pipe_slow );
9176 %}
9177 
9178 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9179   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
9180   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
9181   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
9182   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
9183   ins_encode %{
9184     int vlen_enc = vector_length_encoding(this);
9185     BasicType bt = Matcher::vector_element_basic_type(this);
9186     int opc = this->ideal_Opcode();
9187     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9188                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9189   %}
9190   ins_pipe( pipe_slow );
9191 %}
9192 
9193 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
9194   predicate(!n->as_ShiftV()->is_var_shift());
9195   match(Set dst (URShiftVS (Binary dst src2) mask));
9196   match(Set dst (URShiftVI (Binary dst src2) mask));
9197   match(Set dst (URShiftVL (Binary dst src2) mask));
9198   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
9199   ins_encode %{
9200     int vlen_enc = vector_length_encoding(this);
9201     BasicType bt = Matcher::vector_element_basic_type(this);
9202     int opc = this->ideal_Opcode();
9203     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9204                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9205   %}
9206   ins_pipe( pipe_slow );
9207 %}
9208 
9209 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9210   predicate(n->as_ShiftV()->is_var_shift());
9211   match(Set dst (URShiftVS (Binary dst src2) mask));
9212   match(Set dst (URShiftVI (Binary dst src2) mask));
9213   match(Set dst (URShiftVL (Binary dst src2) mask));
9214   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
9215   ins_encode %{
9216     int vlen_enc = vector_length_encoding(this);
9217     BasicType bt = Matcher::vector_element_basic_type(this);
9218     int opc = this->ideal_Opcode();
9219     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9220                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9221   %}
9222   ins_pipe( pipe_slow );
9223 %}
9224 
9225 instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
9226   match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
9227   match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
9228   match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
9229   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
9230   ins_encode %{
9231     int vlen_enc = vector_length_encoding(this);
9232     BasicType bt = Matcher::vector_element_basic_type(this);
9233     int opc = this->ideal_Opcode();
9234     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9235                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9236   %}
9237   ins_pipe( pipe_slow );
9238 %}
9239 
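// Masked lane-wise min/max. A single MaxV/MinV ideal opcode covers all
// element types, so no per-type match lists are needed; the element type is
// recovered from the node's vector type as usual.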
9240 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
9241   match(Set dst (MaxV (Binary dst src2) mask));
9242   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
9243   ins_encode %{
9244     int vlen_enc = vector_length_encoding(this);
9245     BasicType bt = Matcher::vector_element_basic_type(this);
9246     int opc = this->ideal_Opcode();
9247     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9248                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9249   %}
9250   ins_pipe( pipe_slow );
9251 %}
9252 
9253 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
9254   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
9255   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
9256   ins_encode %{
9257     int vlen_enc = vector_length_encoding(this);
9258     BasicType bt = Matcher::vector_element_basic_type(this);
9259     int opc = this->ideal_Opcode();
9260     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9261                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9262   %}
9263   ins_pipe( pipe_slow );
9264 %}
9265 
9266 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
9267   match(Set dst (MinV (Binary dst src2) mask));
9268   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
9269   ins_encode %{
9270     int vlen_enc = vector_length_encoding(this);
9271     BasicType bt = Matcher::vector_element_basic_type(this);
9272     int opc = this->ideal_Opcode();
9273     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9274                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9275   %}
9276   ins_pipe( pipe_slow );
9277 %}
9278 
9279 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
9280   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
9281   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
9282   ins_encode %{
9283     int vlen_enc = vector_length_encoding(this);
9284     BasicType bt = Matcher::vector_element_basic_type(this);
9285     int opc = this->ideal_Opcode();
9286     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9287                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9288   %}
9289   ins_pipe( pipe_slow );
9290 %}
9291 
9292 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
9293   match(Set dst (VectorRearrange (Binary dst src2) mask));
9294   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
9295   ins_encode %{
9296     int vlen_enc = vector_length_encoding(this);
9297     BasicType bt = Matcher::vector_element_basic_type(this);
9298     int opc = this->ideal_Opcode();
9299     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9300                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
9301   %}
9302   ins_pipe( pipe_slow );
9303 %}
9304 
9305 instruct vabs_masked(vec dst, kReg mask) %{
9306   match(Set dst (AbsVB dst mask));
9307   match(Set dst (AbsVS dst mask));
9308   match(Set dst (AbsVI dst mask));
9309   match(Set dst (AbsVL dst mask));
9310   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
9311   ins_cost(100);
9312   ins_encode %{
9313     int vlen_enc = vector_length_encoding(this);
9314     BasicType bt = Matcher::vector_element_basic_type(this);
9315     int opc = this->ideal_Opcode();
9316     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9317                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
9318   %}
9319   ins_pipe( pipe_slow );
9320 %}
9321 
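// Masked fused multiply-add for float/double vectors. The predicate register
// is carried as the second operand of the second Binary; evmasked_op() is
// handed $dst, $src2 and $src3 and is expected to emit an EVEX FMA form with
// merge-masking, leaving unselected lanes of $dst untouched.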
9322 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
9323   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
9324   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
9325   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
9326   ins_encode %{
9327     int vlen_enc = vector_length_encoding(this);
9328     BasicType bt = Matcher::vector_element_basic_type(this);
9329     int opc = this->ideal_Opcode();
9330     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9331                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
9332   %}
9333   ins_pipe( pipe_slow );
9334 %}
9335 
9336 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
9337   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
9338   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
9339   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
9340   ins_encode %{
9341     int vlen_enc = vector_length_encoding(this);
9342     BasicType bt = Matcher::vector_element_basic_type(this);
9343     int opc = this->ideal_Opcode();
9344     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9345                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
9346   %}
9347   ins_pipe( pipe_slow );
9348 %}
9349 
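// Masked vector comparison into a predicate register. The incoming $mask acts
// as a write mask on the produced predicate: only lanes enabled in $mask can
// set bits in $dst, the remaining bits come out clear.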
9350 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask, rRegP scratch) %{
9351   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
9352   effect(TEMP scratch);
9353   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask\t! using $scratch as TEMP" %}
9354   ins_encode %{
9355     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
9356     int vlen_enc = vector_length_encoding(this, $src1);
9357     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
9358 
9359     // Dispatch on src1's element type: integral compares use evpcmp{b,w,d,q}, FP compares use evcmpps/evcmppd.
9360     switch (src1_elem_bt) {
9361       case T_BYTE: {
9362         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9363         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9364         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9365         break;
9366       }
9367       case T_SHORT: {
9368         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9369         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9370         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9371         break;
9372       }
9373       case T_INT: {
9374         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9375         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9376         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9377         break;
9378       }
9379       case T_LONG: {
9380         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9381         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9382         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9383         break;
9384       }
9385       case T_FLOAT: {
9386         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
9387         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
9388         break;
9389       }
9390       case T_DOUBLE: {
9391         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
9392         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
9393         break;
9394       }
9395       default: assert(false, "%s", type2name(src1_elem_bt)); break;
9396     }
9397   %}
9398   ins_pipe( pipe_slow );
9399 %}
9400 
9401 #ifdef _LP64
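// MaskAll: broadcast a scalar condition (-1 = all lanes, 0 = no lanes) into
// the low Matcher::vector_length(this) bits of a mask register. For example,
// with AVX512BW and a 16-lane vector, MaskAll(-1) loads 0xFFFFFFFFFFFFFFFF
// into the k register via kmovq and then shifts it right by 64 - 16 = 48,
// leaving exactly the low 16 mask bits set.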
9402 instruct mask_all_evexI_imm(kReg dst, immI cnt, rRegL tmp) %{
9403   match(Set dst (MaskAll cnt));
9404   effect(TEMP_DEF dst, TEMP tmp);
9405   format %{ "mask_all_evexI $dst, $cnt \t! using $tmp as TEMP" %}
9406   ins_encode %{
9407     int vec_len = Matcher::vector_length(this);
9408     if (VM_Version::supports_avx512bw()) {
9409       __ movq($tmp$$Register, $cnt$$constant);
9410       __ kmovql($dst$$KRegister, $tmp$$Register);
9411       __ kshiftrql($dst$$KRegister, $dst$$KRegister, 64 - vec_len);
9412     } else {
9413       assert(vec_len <= 16, "");
9414       __ movq($tmp$$Register, $cnt$$constant);
9415       __ kmovwl($dst$$KRegister, $tmp$$Register);
9416       __ kshiftrwl($dst$$KRegister, $dst$$KRegister, 16 - vec_len);
9417     }
9418   %}
9419   ins_pipe( pipe_slow );
9420 %}
9421 
9422 instruct mask_all_evexI(kReg dst, rRegI src, rRegL tmp) %{
9423   match(Set dst (MaskAll src));
9424   effect(TEMP_DEF dst, TEMP tmp);
9425   format %{ "mask_all_evexI $dst, $src \t! using $tmp as TEMP" %}
9426   ins_encode %{
9427     int vec_len = Matcher::vector_length(this);
9428     if (VM_Version::supports_avx512bw()) {
9429       __ movslq($tmp$$Register, $src$$Register);
9430       __ kmovql($dst$$KRegister, $tmp$$Register);
9431       __ kshiftrql($dst$$KRegister, $dst$$KRegister, 64 - vec_len);
9432     } else {
9433       assert(vec_len <= 16, "");
9434       __ kmovwl($dst$$KRegister, $src$$Register);
9435       __ kshiftrwl($dst$$KRegister, $dst$$KRegister, 16 - vec_len);
9436     }
9437   %}
9438   ins_pipe( pipe_slow );
9439 %}
9440 
9441 instruct mask_all_evexL(kReg dst, rRegL src) %{
9442   match(Set dst (MaskAll src));
9443   effect(TEMP_DEF dst);
9444   format %{ "mask_all_evexL $dst, $src \t! mask all operation" %}
9445   ins_encode %{
9446     int vec_len = Matcher::vector_length(this);
9447     if (VM_Version::supports_avx512bw()) {
9448       __ kmovql($dst$$KRegister, $src$$Register);
9449       __ kshiftrql($dst$$KRegister, $dst$$KRegister, 64 - vec_len);
9450     } else {
9451       assert(vec_len <= 16, "");
9452       __ kmovwl($dst$$KRegister, $src$$Register);
9453       __ kshiftrwl($dst$$KRegister, $dst$$KRegister, 16 - vec_len);
9454     }
9455   %}
9456   ins_pipe( pipe_slow );
9457 %}
9458 
9459 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
9460   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
9461   match(Set dst (XorVMask src (MaskAll cnt)));
9462   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
9463   format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
9464   ins_encode %{
9465     uint masklen = Matcher::vector_length(this);
9466     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
9467   %}
9468   ins_pipe( pipe_slow );
9469 %}
9470 
9471 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
9472   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
9473             (Matcher::vector_length(n) == 16) ||
9474             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
9475   match(Set dst (XorVMask src (MaskAll cnt)));
9476   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
9477   ins_encode %{
9478     uint masklen = Matcher::vector_length(this);
9479     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
9480   %}
9481   ins_pipe( pipe_slow );
9482 %}
9483 #endif
9484 
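// Logical operations between two predicate registers. When AVX512DQ is not
// available the byte-granular k-register forms (kandb and friends) do not
// exist, so mask lengths below 16 are widened to 16 and the word forms are
// used instead; the extra high bits are presumably ignored by downstream
// users of the mask.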
9485 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
9486   match(Set dst (AndVMask src1 src2));
9487   match(Set dst (OrVMask src1 src2));
9488   match(Set dst (XorVMask src1 src2));
9489   effect(TEMP kscratch);
9490   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
9491   ins_encode %{
9492     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
9493     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
9494     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
9495     uint masklen = Matcher::vector_length(this);
9496     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
9497     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
9498   %}
9499   ins_pipe( pipe_slow );
9500 %}
9501 
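// CastVV is a no-op at the machine level: the three variants below emit no
// code (size 0, empty encoding) and exist only so the different register
// classes (kReg, vec, legVec) carry their type information through the cast.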
9502 instruct castMM(kReg dst)
9503 %{
9504   match(Set dst (CastVV dst));
9505 
9506   size(0);
9507   format %{ "# castVV of $dst" %}
9508   ins_encode(/* empty encoding */);
9509   ins_cost(0);
9510   ins_pipe(empty);
9511 %}
9512 
9513 instruct castVV(vec dst)
9514 %{
9515   match(Set dst (CastVV dst));
9516 
9517   size(0);
9518   format %{ "# castVV of $dst" %}
9519   ins_encode(/* empty encoding */);
9520   ins_cost(0);
9521   ins_pipe(empty);
9522 %}
9523 
9524 instruct castVVLeg(legVec dst)
9525 %{
9526   match(Set dst (CastVV dst));
9527 
9528   size(0);
9529   format %{ "# castVV of $dst" %}
9530   ins_encode(/* empty encoding */);
9531   ins_cost(0);
9532   ins_pipe(empty);
9533 %}
--- EOF ---