//
// Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
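//
// For example, the first definition further below,
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares the first 32-bit slice of xmm0 as save-on-call under both the
// compiled-code and the C calling convention, spilled as a float (Op_RegF),
// with hardware encoding 0.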

// XMM registers.  512-bit registers, i.e. 16 32-bit words each, labeled (a)-(p).
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              (XMM16-XMM31 are volatile)
//              XMM0-XMM3 might hold parameters

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

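// Flags register (condition codes).  Note that its ideal register type field
// is 0 and it has no VMReg of its own (VMRegImpl::Bad()).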
#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


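// Allocation class for all XMM registers and their 32-bit slices.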
alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

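// Allocation class for the AVX-512 opmask (k) registers.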
alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

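// reg_class_dynamic selects the first (EVEX) class when the given predicate
// holds and falls back to the second (legacy, XMM0-XMM15) class otherwise.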
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for all 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 128bit vector registers
reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d
#endif
                      );

// Class for all 128bit vector registers
reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d,
                      XMM16, XMM16b, XMM16c, XMM16d,
                      XMM17, XMM17b, XMM17c, XMM17d,
                      XMM18, XMM18b, XMM18c, XMM18d,
                      XMM19, XMM19b, XMM19c, XMM19d,
                      XMM20, XMM20b, XMM20c, XMM20d,
                      XMM21, XMM21b, XMM21c, XMM21d,
                      XMM22, XMM22b, XMM22c, XMM22d,
                      XMM23, XMM23b, XMM23c, XMM23d,
                      XMM24, XMM24b, XMM24c, XMM24d,
                      XMM25, XMM25b, XMM25c, XMM25d,
                      XMM26, XMM26b, XMM26c, XMM26d,
                      XMM27, XMM27b, XMM27c, XMM27d,
                      XMM28, XMM28b, XMM28c, XMM28d,
                      XMM29, XMM29b, XMM29c, XMM29d,
                      XMM30, XMM30b, XMM30c, XMM30d,
                      XMM31, XMM31b, XMM31c, XMM31d
#endif
                      );

reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 256bit vector registers
reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1035 #ifdef _LP64
1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
1044 #endif
1045                       );
1046 
1047 // Class for all 256bit vector registers
1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1056 #ifdef _LP64
1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1081 #endif
1082                       );
1083 
1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1086 
1087 // Class for all 512bit vector registers
1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1096 #ifdef _LP64
1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1121 #endif
1122                       );
1123 
1124 // Class for restricted 512bit vector registers
1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1133 #ifdef _LP64
1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1142 #endif
1143                       );
1144 
1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1147 
1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1149 %}
1150 
1151 
1152 //----------SOURCE BLOCK-------------------------------------------------------
1153 // This is a block of C++ code which provides values, functions, and
1154 // definitions necessary in the rest of the architecture description
1155 
1156 source_hpp %{
1157 // Header information of the source block.
1158 // Method declarations/definitions which are used outside
1159 // the ad-scope can conveniently be defined here.
1160 //
1161 // To keep related declarations/definitions/uses close together,
1162 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1163 
1164 #include "runtime/vm_version.hpp"
1165 
1166 class NativeJump;
1167 
1168 class CallStubImpl {
1169 
1170   //--------------------------------------------------------------
1171   //---<  Used for optimization in Compile::shorten_branches  >---
1172   //--------------------------------------------------------------
1173 
1174  public:
1175   // Size of call trampoline stub.
1176   static uint size_call_trampoline() {
1177     return 0; // no call trampolines on this platform
1178   }
1179 
1180   // Number of relocations needed by a call trampoline stub.
1181   static uint reloc_call_trampoline() {
1182     return 0; // no call trampolines on this platform
1183   }
1184 };
1185 
1186 class HandlerImpl {
1187 
1188  public:
1189 
1190   static int emit_exception_handler(CodeBuffer &cbuf);
1191   static int emit_deopt_handler(CodeBuffer& cbuf);
1192 
1193   static uint size_exception_handler() {
1194     // NativeCall instruction size is the same as NativeJump.
1195     // The exception handler starts out as a jump and can be patched
1196     // to a call by deoptimization.  (4932387)
1197     // Note that this value is also credited (in output.cpp) to
1198     // the size of the code section.
1199     return NativeJump::instruction_size;
1200   }
1201 
1202 #ifdef _LP64
1203   static uint size_deopt_handler() {
1204     // three 5 byte instructions plus one move for unreachable address.
1205     return 15+3;
1206   }
1207 #else
1208   static uint size_deopt_handler() {
1209     // NativeCall instruction size is the same as NativeJump.
1210     // The exception handler starts out as a jump and can be patched
1211     // to a call by deoptimization.  (4932387)
1212     // Note that this value is also credited (in output.cpp) to
1213     // the size of the code section.
1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1215   }
1216 #endif
1217 };
1218 
1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
1220   switch(bytes) {
1221     case  4: // fall-through
1222     case  8: // fall-through
1223     case 16: return Assembler::AVX_128bit;
1224     case 32: return Assembler::AVX_256bit;
1225     case 64: return Assembler::AVX_512bit;
1226 
1227     default: {
1228       ShouldNotReachHere();
1229       return Assembler::AVX_NoVec;
1230     }
1231   }
1232 }
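     // Illustrative mapping, derived from the switch above: 4-, 8- and 16-byte
     // vectors are emitted with the 128-bit encoding (Assembler::AVX_128bit),
     // 32-byte vectors with AVX_256bit, and 64-byte vectors with AVX_512bit.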
1233 
1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
1236 }
1237 
1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1239   uint def_idx = use->operand_index(opnd);
1240   Node* def = use->in(def_idx);
1241   return vector_length_encoding(def);
1242 }
1243 
1244 static inline bool is_unsigned_booltest_pred(int bt) {
1245   return ((bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare);
1246 }
1247 
1248 class Node::PD {
1249 public:
1250   enum NodeFlags {
1251     Flag_intel_jcc_erratum = Node::_last_flag << 1,
1252     _last_flag             = Flag_intel_jcc_erratum
1253   };
1254 };
1255 
1256 %} // end source_hpp
1257 
1258 source %{
1259 
1260 #include "opto/addnode.hpp"
1261 #include "c2_intelJccErratum_x86.hpp"
1262 
1263 void PhaseOutput::pd_perform_mach_node_analysis() {
1264   if (VM_Version::has_intel_jcc_erratum()) {
1265     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
1266     _buf_sizes._code += extra_padding;
1267   }
1268 }
1269 
1270 int MachNode::pd_alignment_required() const {
1271   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
1272     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
1273     return IntelJccErratum::largest_jcc_size() + 1;
1274   } else {
1275     return 1;
1276   }
1277 }
1278 
1279 int MachNode::compute_padding(int current_offset) const {
1280   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
1281     Compile* C = Compile::current();
1282     PhaseOutput* output = C->output();
1283     Block* block = output->block();
1284     int index = output->index();
1285     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
1286   } else {
1287     return 0;
1288   }
1289 }
1290 
1291 // Emit exception handler code.
1292 // Stuff framesize into a register and call a VM stub routine.
1293 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1294 
1295   // Note that the code buffer's insts_mark is always relative to insts.
1296   // That's why we must use the macroassembler to generate a handler.
1297   C2_MacroAssembler _masm(&cbuf);
1298   address base = __ start_a_stub(size_exception_handler());
1299   if (base == NULL) {
1300     ciEnv::current()->record_failure("CodeCache is full");
1301     return 0;  // CodeBuffer::expand failed
1302   }
1303   int offset = __ offset();
1304   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1305   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1306   __ end_a_stub();
1307   return offset;
1308 }
1309 
1310 // Emit deopt handler code.
1311 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1312 
1313   // Note that the code buffer's insts_mark is always relative to insts.
1314   // That's why we must use the macroassembler to generate a handler.
1315   C2_MacroAssembler _masm(&cbuf);
1316   address base = __ start_a_stub(size_deopt_handler());
1317   if (base == NULL) {
1318     ciEnv::current()->record_failure("CodeCache is full");
1319     return 0;  // CodeBuffer::expand failed
1320   }
1321   int offset = __ offset();
1322 
1323 #ifdef _LP64
1324   address the_pc = (address) __ pc();
1325   Label next;
1326   // Push the value of "the_pc" on the stack without destroying any
1327   // registers, as they may all be live.
1328 
1329   // push address of "next"
1330   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1331   __ bind(next);
1332   // adjust it so it matches "the_pc"
1333   __ subptr(Address(rsp, 0), __ offset() - offset);
1334 #else
1335   InternalAddress here(__ pc());
1336   __ pushptr(here.addr());
1337 #endif
1338 
1339   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1340   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1341   __ end_a_stub();
1342   return offset;
1343 }
1344 
1345 Assembler::Width widthForType(BasicType bt) {
1346   if (bt == T_BYTE) {
1347     return Assembler::B;
1348   } else if (bt == T_SHORT) {
1349     return Assembler::W;
1350   } else if (bt == T_INT) {
1351     return Assembler::D;
1352   } else {
1353     assert(bt == T_LONG, "not a long: %s", type2name(bt));
1354     return Assembler::Q;
1355   }
1356 }
1357 
1358 //=============================================================================
1359 
1360   // Float masks come from different places depending on platform.
1361 #ifdef _LP64
1362   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1363   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1364   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1365   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1366 #else
1367   static address float_signmask()  { return (address)float_signmask_pool; }
1368   static address float_signflip()  { return (address)float_signflip_pool; }
1369   static address double_signmask() { return (address)double_signmask_pool; }
1370   static address double_signflip() { return (address)double_signflip_pool; }
1371 #endif
1372   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1373   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1374   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1375   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1376   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1377   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
1378   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1379   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
1380   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1381   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1382   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1383   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1384   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1385   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
1386   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
1387 
1388 //=============================================================================
1389 const bool Matcher::match_rule_supported(int opcode) {
1390   if (!has_match_rule(opcode)) {
1391     return false; // no match rule present
1392   }
1393   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1394   switch (opcode) {
1395     case Op_AbsVL:
1396     case Op_StoreVectorScatter:
1397       if (UseAVX < 3) {
1398         return false;
1399       }
1400       break;
1401     case Op_PopCountI:
1402     case Op_PopCountL:
1403       if (!UsePopCountInstruction) {
1404         return false;
1405       }
1406       break;
1407     case Op_PopCountVI:
1408       if (!UsePopCountInstruction || (UseAVX < 2)) {
1409         return false;
1410       }
1411       break;
1412     case Op_PopCountVL:
1413       if (!UsePopCountInstruction || (UseAVX <= 2)) {
1414         return false;
1415       }
1416       break;
1417     case Op_MulVI:
1418       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
1419         return false;
1420       }
1421       break;
1422     case Op_MulVL:
1423       if (UseSSE < 4) { // only with SSE4_1 or AVX
1424         return false;
1425       }
1426       break;
1427     case Op_MulReductionVL:
1428       if (VM_Version::supports_avx512dq() == false) {
1429         return false;
1430       }
1431       break;
1432     case Op_AddReductionVL:
1433       if (UseSSE < 2) { // requires at least SSE2
1434         return false;
1435       }
1436       break;
1437     case Op_AbsVB:
1438     case Op_AbsVS:
1439     case Op_AbsVI:
1440     case Op_AddReductionVI:
1441     case Op_AndReductionV:
1442     case Op_OrReductionV:
1443     case Op_XorReductionV:
1444       if (UseSSE < 3) { // requires at least SSSE3
1445         return false;
1446       }
1447       break;
1448     case Op_VectorLoadShuffle:
1449     case Op_VectorRearrange:
1450     case Op_MulReductionVI:
1451       if (UseSSE < 4) { // requires at least SSE4
1452         return false;
1453       }
1454       break;
1455     case Op_SqrtVD:
1456     case Op_SqrtVF:
1457     case Op_VectorMaskCmp:
1458     case Op_VectorCastB2X:
1459     case Op_VectorCastS2X:
1460     case Op_VectorCastI2X:
1461     case Op_VectorCastL2X:
1462     case Op_VectorCastF2X:
1463     case Op_VectorCastD2X:
1464     case Op_VectorUCastB2X:
1465     case Op_VectorUCastS2X:
1466     case Op_VectorUCastI2X:
1467       if (UseAVX < 1) { // enabled for AVX only
1468         return false;
1469       }
1470       break;
1471     case Op_RoundVF:
1472       if (UseAVX < 2) { // enabled for AVX2 only
1473         return false;
1474       }
1475       break;
1476     case Op_RoundVD:
1477       if (UseAVX < 3) {
1478         return false;  // enabled for AVX3 only
1479       }
1480       break;
1481     case Op_CompareAndSwapL:
1482 #ifdef _LP64
1483     case Op_CompareAndSwapP:
1484 #endif
1485       if (!VM_Version::supports_cx8()) {
1486         return false;
1487       }
1488       break;
1489     case Op_CMoveVF:
1490     case Op_CMoveVD:
1491       if (UseAVX < 1) { // enabled for AVX only
1492         return false;
1493       }
1494       break;
1495     case Op_StrIndexOf:
1496       if (!UseSSE42Intrinsics) {
1497         return false;
1498       }
1499       break;
1500     case Op_StrIndexOfChar:
1501       if (!UseSSE42Intrinsics) {
1502         return false;
1503       }
1504       break;
1505     case Op_OnSpinWait:
1506       if (VM_Version::supports_on_spin_wait() == false) {
1507         return false;
1508       }
1509       break;
1510     case Op_MulVB:
1511     case Op_LShiftVB:
1512     case Op_RShiftVB:
1513     case Op_URShiftVB:
1514     case Op_VectorInsert:
1515     case Op_VectorLoadMask:
1516     case Op_VectorStoreMask:
1517     case Op_VectorBlend:
1518       if (UseSSE < 4) {
1519         return false;
1520       }
1521       break;
1522 #ifdef _LP64
1523     case Op_MaxD:
1524     case Op_MaxF:
1525     case Op_MinD:
1526     case Op_MinF:
1527       if (UseAVX < 1) { // enabled for AVX only
1528         return false;
1529       }
1530       break;
1531 #endif
1532     case Op_CacheWB:
1533     case Op_CacheWBPreSync:
1534     case Op_CacheWBPostSync:
1535       if (!VM_Version::supports_data_cache_line_flush()) {
1536         return false;
1537       }
1538       break;
1539     case Op_ExtractB:
1540     case Op_ExtractL:
1541     case Op_ExtractI:
1542     case Op_RoundDoubleMode:
1543       if (UseSSE < 4) {
1544         return false;
1545       }
1546       break;
1547     case Op_RoundDoubleModeV:
1548       if (VM_Version::supports_avx() == false) {
1549         return false; // 128bit vroundpd is not available
1550       }
1551       break;
1552     case Op_LoadVectorGather:
1553       if (UseAVX < 2) {
1554         return false;
1555       }
1556       break;
1557     case Op_FmaVD:
1558     case Op_FmaVF:
1559       if (!UseFMA) {
1560         return false;
1561       }
1562       break;
1563     case Op_MacroLogicV:
1564       if (UseAVX < 3 || !UseVectorMacroLogic) {
1565         return false;
1566       }
1567       break;
1568 
1569     case Op_VectorCmpMasked:
1570     case Op_VectorMaskGen:
1571     case Op_LoadVectorMasked:
1572     case Op_StoreVectorMasked:
1573       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
1574         return false;
1575       }
1576       break;
1577     case Op_VectorMaskFirstTrue:
1578     case Op_VectorMaskLastTrue:
1579     case Op_VectorMaskTrueCount:
1580     case Op_VectorMaskToLong:
1581       if (!is_LP64 || UseAVX < 1) {
1582          return false;
1583       }
1584       break;
1585     case Op_RoundF:
1586     case Op_RoundD:
1587       if (!is_LP64) {
1588         return false;
1589       }
1590       break;
1591     case Op_CopySignD:
1592     case Op_CopySignF:
1593       if (UseAVX < 3 || !is_LP64)  {
1594         return false;
1595       }
1596       if (!VM_Version::supports_avx512vl()) {
1597         return false;
1598       }
1599       break;
1600 #ifndef _LP64
1601     case Op_AddReductionVF:
1602     case Op_AddReductionVD:
1603     case Op_MulReductionVF:
1604     case Op_MulReductionVD:
1605       if (UseSSE < 1) { // requires at least SSE
1606         return false;
1607       }
1608       break;
1609     case Op_MulAddVS2VI:
1610     case Op_RShiftVL:
1611     case Op_AbsVD:
1612     case Op_NegVD:
1613       if (UseSSE < 2) {
1614         return false;
1615       }
1616       break;
1617 #endif // !LP64
1618     case Op_SignumF:
1619       if (UseSSE < 1) {
1620         return false;
1621       }
1622       break;
1623     case Op_SignumD:
1624       if (UseSSE < 2) {
1625         return false;
1626       }
1627       break;
1628     case Op_SqrtF:
1629       if (UseSSE < 1) {
1630         return false;
1631       }
1632       break;
1633     case Op_SqrtD:
1634 #ifdef _LP64
1635       if (UseSSE < 2) {
1636         return false;
1637       }
1638 #else
1639       // x86_32.ad has a special match rule for SqrtD.
1640       // Together with common x86 rules, this handles all UseSSE cases.
1641 #endif
1642       break;
1643   }
1644   return true;  // Match rules are supported by default.
1645 }
1646 
1647 //------------------------------------------------------------------------
1648 
1649 // Identify extra cases where we might want to provide match rules for vector nodes
1650 // and other intrinsics, guarded by vector length (vlen) and element type (bt).
1651 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1652   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1653   if (!match_rule_supported(opcode)) {
1654     return false;
1655   }
1656   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1657   //   * SSE2 supports 128bit vectors for all types;
1658   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1659   //   * AVX2 supports 256bit vectors for all types;
1660   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1661   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1662   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1663   // And MaxVectorSize is taken into account as well.
1664   if (!vector_size_supported(bt, vlen)) {
1665     return false;
1666   }
1667   // Special cases which require vector length follow:
1668   //   * implementation limitations
1669   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1670   //   * 128bit vroundpd instruction is present only in AVX1
1671   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1672   switch (opcode) {
1673     case Op_AbsVF:
1674     case Op_NegVF:
1675       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1676         return false; // 512bit vandps and vxorps are not available
1677       }
1678       break;
1679     case Op_AbsVD:
1680     case Op_NegVD:
1681     case Op_MulVL:
1682       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1683         return false; // 512bit vpmullq, vandpd and vxorpd are not available
1684       }
1685       break;
1686     case Op_CMoveVF:
1687       if (vlen != 8) {
1688         return false; // implementation limitation (only vcmov8F_reg is present)
1689       }
1690       break;
1691     case Op_RotateRightV:
1692     case Op_RotateLeftV:
1693       if (bt != T_INT && bt != T_LONG) {
1694         return false;
1695       } // fallthrough
1696     case Op_MacroLogicV:
1697       if (!VM_Version::supports_evex() ||
1698           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
1699         return false;
1700       }
1701       break;
1702     case Op_ClearArray:
1703     case Op_VectorMaskGen:
1704     case Op_VectorCmpMasked:
1705     case Op_LoadVectorMasked:
1706     case Op_StoreVectorMasked:
1707       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
1708         return false;
1709       }
1710       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
1711         return false;
1712       }
1713       break;
1714     case Op_CMoveVD:
1715       if (vlen != 4) {
1716         return false; // implementation limitation (only vcmov4D_reg is present)
1717       }
1718       break;
1719     case Op_MaxV:
1720     case Op_MinV:
1721       if (UseSSE < 4 && is_integral_type(bt)) {
1722         return false;
1723       }
1724       if (bt == T_FLOAT || bt == T_DOUBLE) {
1725         // Float/Double intrinsics are enabled for AVX family currently.
1726         if (UseAVX == 0) {
1727           return false;
1728         }
1729         if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
1730           return false;
1731         }
1732       }
1733       break;
1734     case Op_CallLeafVector:
1735       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
1736         return false;
1737       }
1738       break;
1739     case Op_AddReductionVI:
1740       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
1741         return false;
1742       }
1743       // fallthrough
1744     case Op_AndReductionV:
1745     case Op_OrReductionV:
1746     case Op_XorReductionV:
1747       if (is_subword_type(bt) && (UseSSE < 4)) {
1748         return false;
1749       }
1750 #ifndef _LP64
1751       if (bt == T_BYTE || bt == T_LONG) {
1752         return false;
1753       }
1754 #endif
1755       break;
1756 #ifndef _LP64
1757     case Op_VectorInsert:
1758       if (bt == T_LONG || bt == T_DOUBLE) {
1759         return false;
1760       }
1761       break;
1762 #endif
1763     case Op_MinReductionV:
1764     case Op_MaxReductionV:
1765       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
1766         return false;
1767       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
1768         return false;
1769       }
1770       // Float/Double intrinsics enabled for AVX family.
1771       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
1772         return false;
1773       }
1774       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
1775         return false;
1776       }
1777 #ifndef _LP64
1778       if (bt == T_BYTE || bt == T_LONG) {
1779         return false;
1780       }
1781 #endif
1782       break;
1783     case Op_VectorTest:
1784       if (UseSSE < 4) {
1785         return false; // Implementation limitation
1786       } else if (size_in_bits < 32) {
1787         return false; // Implementation limitation
1788       } else if (size_in_bits == 512 && (VM_Version::supports_avx512bw() == false)) {
1789         return false; // Implementation limitation
1790       }
1791       break;
1792     case Op_VectorLoadShuffle:
1793     case Op_VectorRearrange:
1794       if (vlen == 2) {
1795         return false; // Implementation limitation due to how shuffle is loaded
1796       } else if (size_in_bits == 256 && UseAVX < 2) {
1797         return false; // Implementation limitation
1798       } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi())  {
1799         return false; // Implementation limitation
1800       } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw())  {
1801         return false; // Implementation limitation
1802       }
1803       break;
1804     case Op_VectorLoadMask:
1805       if (size_in_bits == 256 && UseAVX < 2) {
1806         return false; // Implementation limitation
1807       }
1808       // fallthrough
1809     case Op_VectorStoreMask:
1810       if (vlen == 2) {
1811         return false; // Implementation limitation
1812       }
1813       break;
1814     case Op_VectorCastB2X:
1815     case Op_VectorCastS2X:
1816     case Op_VectorCastI2X:
1817       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
1818         return false;
1819       }
1820       break;
1821     case Op_VectorCastL2X:
1822       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1823         return false;
1824       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
1825         return false;
1826       }
1827       break;
1828     case Op_VectorCastD2X:
1829       if (is_subword_type(bt) || bt == T_INT) {
1830         return false;
1831       }
1832       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
1833         return false;
1834       }
1835       break;
1836     case Op_RoundVD:
1837       if (!VM_Version::supports_avx512dq()) {
1838         return false;
1839       }
1840       break;
1841     case Op_VectorCastF2X:
1842       if (is_subword_type(bt) || bt == T_LONG) {
1843         return false;
1844       }
1845       break;
1846     case Op_MulReductionVI:
1847       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1848         return false;
1849       }
1850       break;
1851     case Op_LoadVectorGatherMasked:
1852     case Op_StoreVectorScatterMasked:
1853     case Op_StoreVectorScatter:
1854       if (is_subword_type(bt)) {
1855         return false;
1856       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1857         return false;
1858       }
1859       // fallthrough
1860     case Op_LoadVectorGather:
1861       if (size_in_bits == 64) {
1862         return false;
1863       }
1864       break;
1865     case Op_MaskAll:
1866       if (!VM_Version::supports_evex()) {
1867         return false;
1868       }
1869       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
1870         return false;
1871       }
1872       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1873         return false;
1874       }
1875       break;
1876     case Op_VectorMaskCmp:
1877       if (vlen < 2 || size_in_bits < 32) {
1878         return false;
1879       }
1880       break;
1881     case Op_VectorLongToMask:
1882       if (UseAVX < 1 || !is_LP64) {
1883         return false;
1884       }
1885       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
1886         return false;
1887       }
1888       break;
1889     case Op_PopCountVI:
1890       if (!VM_Version::supports_avx512_vpopcntdq() &&
1891           (vlen == 16) && !VM_Version::supports_avx512bw()) {
1892         return false;
1893       }
1894       break;
1895     case Op_PopCountVL:
1896       if (!VM_Version::supports_avx512_vpopcntdq() &&
1897           ((vlen <= 4) || ((vlen == 8) && !VM_Version::supports_avx512bw()))) {
1898         return false;
1899       }
1900       break;
1901   }
1902   return true;  // By default, match rules are supported.
1903 }
1904 
1905 const bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
1906   // The ADLC-based match_rule_supported routine checks for the existence of a pattern
1907   // based on the IR opcode. Most unary/binary/ternary masked operations share the IR
1908   // nodes of their non-masked counterparts, with the mask edge being the differentiator.
1909   // This routine therefore does a strict check on the existence of masked operation
1910   // patterns: it returns false by default for all opcodes other than the ones whose
1911   // masked instruction patterns are defined in this file.
1912   if (!match_rule_supported_vector(opcode, vlen, bt)) {
1913     return false;
1914   }
1915 
1916   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1917   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1918   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
1919     return false;
1920   }
1921   switch(opcode) {
1922     // Unary masked operations
1923     case Op_AbsVB:
1924     case Op_AbsVS:
1925       if (!VM_Version::supports_avx512bw()) {
1926         return false;  // Implementation limitation
1927       }
1928     case Op_AbsVI:
1929     case Op_AbsVL:
1930       return true;
1931 
1932     // Ternary masked operations
1933     case Op_FmaVF:
1934     case Op_FmaVD:
1935       return true;
1936 
1937     case Op_MacroLogicV:
1938       if (bt != T_INT && bt != T_LONG) {
1939         return false;
1940       }
1941       return true;
1942 
1943     // Binary masked operations
1944     case Op_AddVB:
1945     case Op_AddVS:
1946     case Op_SubVB:
1947     case Op_SubVS:
1948     case Op_MulVS:
1949     case Op_LShiftVS:
1950     case Op_RShiftVS:
1951     case Op_URShiftVS:
1952       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1953       if (!VM_Version::supports_avx512bw()) {
1954         return false;  // Implementation limitation
1955       }
1956       return true;
1957 
1958     case Op_MulVL:
1959       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1960       if (!VM_Version::supports_avx512dq()) {
1961         return false;  // Implementation limitation
1962       }
1963       return true;
1964 
1965     case Op_AndV:
1966     case Op_OrV:
1967     case Op_XorV:
1968     case Op_RotateRightV:
1969     case Op_RotateLeftV:
1970       if (bt != T_INT && bt != T_LONG) {
1971         return false; // Implementation limitation
1972       }
1973       return true;
1974 
1975     case Op_VectorLoadMask:
1976       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1977       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
1978         return false;
1979       }
1980       return true;
1981 
1982     case Op_AddVI:
1983     case Op_AddVL:
1984     case Op_AddVF:
1985     case Op_AddVD:
1986     case Op_SubVI:
1987     case Op_SubVL:
1988     case Op_SubVF:
1989     case Op_SubVD:
1990     case Op_MulVI:
1991     case Op_MulVF:
1992     case Op_MulVD:
1993     case Op_DivVF:
1994     case Op_DivVD:
1995     case Op_SqrtVF:
1996     case Op_SqrtVD:
1997     case Op_LShiftVI:
1998     case Op_LShiftVL:
1999     case Op_RShiftVI:
2000     case Op_RShiftVL:
2001     case Op_URShiftVI:
2002     case Op_URShiftVL:
2003     case Op_LoadVectorMasked:
2004     case Op_StoreVectorMasked:
2005     case Op_LoadVectorGatherMasked:
2006     case Op_StoreVectorScatterMasked:
2007       return true;
2008 
2009     case Op_MaxV:
2010     case Op_MinV:
2011       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
2012         return false; // Implementation limitation
2013       }
2014       if (is_floating_point_type(bt)) {
2015         return false; // Implementation limitation
2016       }
2017       return true;
2018 
2019     case Op_VectorMaskCmp:
2020       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
2021         return false; // Implementation limitation
2022       }
2023       return true;
2024 
2025     case Op_VectorRearrange:
2026       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
2027         return false; // Implementation limitation
2028       }
2029       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
2030         return false; // Implementation limitation
2031       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
2032         return false; // Implementation limitation
2033       }
2034       return true;
2035 
2036     // Binary Logical operations
2037     case Op_AndVMask:
2038     case Op_OrVMask:
2039     case Op_XorVMask:
2040       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
2041         return false; // Implementation limitation
2042       }
2043       return true;
2044 
2045     case Op_MaskAll:
2046       return true;
2047 
2048     default:
2049       return false;
2050   }
2051 }
2052 
2053 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
2054   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
2055   bool legacy = (generic_opnd->opcode() == LEGVEC);
2056   if (!VM_Version::supports_avx512vlbwdq() && // KNL
2057       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
2058     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
2059     return new legVecZOper();
2060   }
2061   if (legacy) {
2062     switch (ideal_reg) {
2063       case Op_VecS: return new legVecSOper();
2064       case Op_VecD: return new legVecDOper();
2065       case Op_VecX: return new legVecXOper();
2066       case Op_VecY: return new legVecYOper();
2067       case Op_VecZ: return new legVecZOper();
2068     }
2069   } else {
2070     switch (ideal_reg) {
2071       case Op_VecS: return new vecSOper();
2072       case Op_VecD: return new vecDOper();
2073       case Op_VecX: return new vecXOper();
2074       case Op_VecY: return new vecYOper();
2075       case Op_VecZ: return new vecZOper();
2076     }
2077   }
2078   ShouldNotReachHere();
2079   return NULL;
2080 }
2081 
2082 bool Matcher::is_reg2reg_move(MachNode* m) {
2083   switch (m->rule()) {
2084     case MoveVec2Leg_rule:
2085     case MoveLeg2Vec_rule:
2086     case MoveF2VL_rule:
2087     case MoveF2LEG_rule:
2088     case MoveVL2F_rule:
2089     case MoveLEG2F_rule:
2090     case MoveD2VL_rule:
2091     case MoveD2LEG_rule:
2092     case MoveVL2D_rule:
2093     case MoveLEG2D_rule:
2094       return true;
2095     default:
2096       return false;
2097   }
2098 }
2099 
2100 bool Matcher::is_generic_vector(MachOper* opnd) {
2101   switch (opnd->opcode()) {
2102     case VEC:
2103     case LEGVEC:
2104       return true;
2105     default:
2106       return false;
2107   }
2108 }
2109 
2110 //------------------------------------------------------------------------
2111 
2112 const RegMask* Matcher::predicate_reg_mask(void) {
2113   return &_VECTMASK_REG_mask;
2114 }
2115 
2116 const TypeVectMask* Matcher::predicate_reg_type(const Type* elemTy, int length) {
2117   return new TypeVectMask(elemTy, length);
2118 }
2119 
2120 // Max vector size in bytes. 0 if not supported.
2121 const int Matcher::vector_width_in_bytes(BasicType bt) {
2122   assert(is_java_primitive(bt), "only primitive type vectors");
2123   if (UseSSE < 2) return 0;
2124   // SSE2 supports 128bit vectors for all types.
2125   // AVX2 supports 256bit vectors for all types.
2126   // EVEX (AVX512) supports 512bit vectors for all types.
2127   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
2128   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
2129   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
2130     size = (UseAVX > 2) ? 64 : 32;
2131   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
2132     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
2133   // Use flag to limit vector size.
2134   size = MIN2(size,(int)MaxVectorSize);
2135   // Minimum 2 values in vector (or 4 for bytes).
2136   switch (bt) {
2137   case T_DOUBLE:
2138   case T_LONG:
2139     if (size < 16) return 0;
2140     break;
2141   case T_FLOAT:
2142   case T_INT:
2143     if (size < 8) return 0;
2144     break;
2145   case T_BOOLEAN:
2146     if (size < 4) return 0;
2147     break;
2148   case T_CHAR:
2149     if (size < 4) return 0;
2150     break;
2151   case T_BYTE:
2152     if (size < 4) return 0;
2153     break;
2154   case T_SHORT:
2155     if (size < 4) return 0;
2156     break;
2157   default:
2158     ShouldNotReachHere();
2159   }
2160   return size;
2161 }
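     // Worked example (illustrative only): with UseAVX == 2 and MaxVectorSize == 32,
     // size starts at (1 << 2) * 8 = 32 bytes for every type, is not widened by the
     // AVX512 checks, and passes the per-type minimum, so vector_width_in_bytes(T_INT)
     // returns 32. With UseAVX == 0 and UseSSE >= 2 the width is at most 16 bytes.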
2162 
2163 // Limits on vector size (number of elements) loaded into vector.
2164 const int Matcher::max_vector_size(const BasicType bt) {
2165   return vector_width_in_bytes(bt)/type2aelembytes(bt);
2166 }
2167 const int Matcher::min_vector_size(const BasicType bt) {
2168   int max_size = max_vector_size(bt);
2169   // The minimum size which can be loaded into a vector is 4 bytes.
2170   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
2171   // Allow single-element double vectors to support SVML double64 vector calls.
2172   if (bt == T_DOUBLE) {
2173     size = 1;
2174   }
2175   return MIN2(size,max_size);
2176 }
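     // Illustrative values: byte-sized element types (T_BYTE, T_BOOLEAN) need at least
     // 4 elements, most other types at least 2, and T_DOUBLE is lowered to a single
     // element so that the SVML double64 calls mentioned above stay vectorizable;
     // the result is always clamped by max_vector_size(bt).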
2177 
2178 const int Matcher::scalable_vector_reg_size(const BasicType bt) {
2179   return -1;
2180 }
2181 
2182 // Vector ideal reg corresponding to specified size in bytes
2183 const uint Matcher::vector_ideal_reg(int size) {
2184   assert(MaxVectorSize >= size, "");
2185   switch(size) {
2186     case  4: return Op_VecS;
2187     case  8: return Op_VecD;
2188     case 16: return Op_VecX;
2189     case 32: return Op_VecY;
2190     case 64: return Op_VecZ;
2191   }
2192   ShouldNotReachHere();
2193   return 0;
2194 }
2195 
2196 // Check for shift by small constant as well
2197 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
2198   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
2199       shift->in(2)->get_int() <= 3 &&
2200       // Are there other uses besides address expressions?
2201       !matcher->is_visited(shift)) {
2202     address_visited.set(shift->_idx); // Flag as address_visited
2203     mstack.push(shift->in(2), Matcher::Visit);
2204     Node *conv = shift->in(1);
2205 #ifdef _LP64
2206     // Allow the Matcher to match the rule which bypasses the
2207     // ConvI2L operation for an array index on LP64
2208     // if the index value is positive.
2209     if (conv->Opcode() == Op_ConvI2L &&
2210         conv->as_Type()->type()->is_long()->_lo >= 0 &&
2211         // Are there other uses besides address expressions?
2212         !matcher->is_visited(conv)) {
2213       address_visited.set(conv->_idx); // Flag as address_visited
2214       mstack.push(conv->in(1), Matcher::Pre_Visit);
2215     } else
2216 #endif
2217       mstack.push(conv, Matcher::Pre_Visit);
2218     return true;
2219   }
2220   return false;
2221 }
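     // Illustrative case (not taken from the original comments): an address of the
     // form base + (index << 2), as produced for an int[] element access, has a
     // shift count of 2 (<= 3), so clone_shift() lets the shift be cloned into each
     // address expression and folded into a scaled-index addressing mode instead of
     // being materialized in a separate register.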
2222 
2223 // This function identifies sub-graphs in which a 'load' node is
2224 // an input to two different nodes, such that the sub-graph can be matched
2225 // with BMI instructions like blsi, blsr, etc.
2226 // For example, b = -a[i] & a[i] can be matched to blsi r32, m32.
2227 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
2228 // refers to the same node.
2229 //
2230 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
2231 // This is a temporary solution until we make DAGs expressible in ADL.
2232 template<typename ConType>
2233 class FusedPatternMatcher {
2234   Node* _op1_node;
2235   Node* _mop_node;
2236   int _con_op;
2237 
2238   static int match_next(Node* n, int next_op, int next_op_idx) {
2239     if (n->in(1) == NULL || n->in(2) == NULL) {
2240       return -1;
2241     }
2242 
2243     if (next_op_idx == -1) { // n is commutative, try rotations
2244       if (n->in(1)->Opcode() == next_op) {
2245         return 1;
2246       } else if (n->in(2)->Opcode() == next_op) {
2247         return 2;
2248       }
2249     } else {
2250       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
2251       if (n->in(next_op_idx)->Opcode() == next_op) {
2252         return next_op_idx;
2253       }
2254     }
2255     return -1;
2256   }
2257 
2258  public:
2259   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
2260     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
2261 
2262   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
2263              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
2264              typename ConType::NativeType con_value) {
2265     if (_op1_node->Opcode() != op1) {
2266       return false;
2267     }
2268     if (_mop_node->outcnt() > 2) {
2269       return false;
2270     }
2271     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
2272     if (op1_op2_idx == -1) {
2273       return false;
2274     }
2275     // Memory operation must be the other edge
2276     int op1_mop_idx = (op1_op2_idx & 1) + 1;
2277 
2278     // Check that the mop node is really what we want
2279     if (_op1_node->in(op1_mop_idx) == _mop_node) {
2280       Node* op2_node = _op1_node->in(op1_op2_idx);
2281       if (op2_node->outcnt() > 1) {
2282         return false;
2283       }
2284       assert(op2_node->Opcode() == op2, "Should be");
2285       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
2286       if (op2_con_idx == -1) {
2287         return false;
2288       }
2289       // Memory operation must be the other edge
2290       int op2_mop_idx = (op2_con_idx & 1) + 1;
2291       // Check that the memory operation is the same node
2292       if (op2_node->in(op2_mop_idx) == _mop_node) {
2293         // Now check the constant
2294         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
2295         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
2296           return true;
2297         }
2298       }
2299     }
2300     return false;
2301   }
2302 };
2303 
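     // For instance, the call bmii.match(Op_AndI, -1, Op_SubI, 1, 0) below encodes the
     // shape (AndI (SubI ConI(0) LoadI*) LoadI*): AndI is commutative (op1_op2_idx == -1),
     // SubI must have the constant 0 as its first input, and both LoadI* edges must be
     // the same node; this is the blsi pattern b = -a[i] & a[i] described above.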
2304 static bool is_bmi_pattern(Node* n, Node* m) {
2305   assert(UseBMI1Instructions, "sanity");
2306   if (n != NULL && m != NULL) {
2307     if (m->Opcode() == Op_LoadI) {
2308       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
2309       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
2310              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
2311              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
2312     } else if (m->Opcode() == Op_LoadL) {
2313       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
2314       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
2315              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
2316              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
2317     }
2318   }
2319   return false;
2320 }
2321 
2322 // Should the matcher clone input 'm' of node 'n'?
2323 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
2324   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
2325   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
2326     mstack.push(m, Visit);
2327     return true;
2328   }
2329   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
2330     mstack.push(m, Visit);           // m = ShiftCntV
2331     return true;
2332   }
2333   return false;
2334 }
2335 
2336 // Should the Matcher clone shifts on addressing modes, expecting them
2337 // to be subsumed into complex addressing expressions or compute them
2338 // into registers?
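     // Illustrative example (a common shape, not the only one handled): an array access
     // a[i] typically appears as (AddP base (AddP base (LShiftL idx scale)) disp), which
     // folds into a single [base + idx*scale + disp] addressing mode on x86, so the shift
     // and the constant offset are cloned next to each memory user rather than being
     // computed into registers.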
2339 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
2340   Node *off = m->in(AddPNode::Offset);
2341   if (off->is_Con()) {
2342     address_visited.test_set(m->_idx); // Flag as address_visited
2343     Node *adr = m->in(AddPNode::Address);
2344 
2345     // Intel can handle 2 adds in addressing mode
2346     // AtomicAdd is not an addressing expression.
2347     // Cheap to find it by looking for screwy base.
2348     if (adr->is_AddP() &&
2349         !adr->in(AddPNode::Base)->is_top() &&
2350         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
2351         // Are there other uses besides address expressions?
2352         !is_visited(adr)) {
2353       address_visited.set(adr->_idx); // Flag as address_visited
2354       Node *shift = adr->in(AddPNode::Offset);
2355       if (!clone_shift(shift, this, mstack, address_visited)) {
2356         mstack.push(shift, Pre_Visit);
2357       }
2358       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
2359       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
2360     } else {
2361       mstack.push(adr, Pre_Visit);
2362     }
2363 
2364     // Clone X+offset as it also folds into most addressing expressions
2365     mstack.push(off, Visit);
2366     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2367     return true;
2368   } else if (clone_shift(off, this, mstack, address_visited)) {
2369     address_visited.test_set(m->_idx); // Flag as address_visited
2370     mstack.push(m->in(AddPNode::Address), Pre_Visit);
2371     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2372     return true;
2373   }
2374   return false;
2375 }
2376 
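     // Maps a BoolTest condition to the corresponding Assembler::ComparisonPredicate for
     // integer vector compares; signed and unsigned variants of the same relation share a
     // predicate here.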
2377 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
2378   switch (bt) {
2379     case BoolTest::eq:
2380       return Assembler::eq;
2381     case BoolTest::ne:
2382       return Assembler::neq;
2383     case BoolTest::le:
2384     case BoolTest::ule:
2385       return Assembler::le;
2386     case BoolTest::ge:
2387     case BoolTest::uge:
2388       return Assembler::nlt;
2389     case BoolTest::lt:
2390     case BoolTest::ult:
2391       return Assembler::lt;
2392     case BoolTest::gt:
2393     case BoolTest::ugt:
2394       return Assembler::nle;
2395     default : ShouldNotReachHere(); return Assembler::_false;
2396   }
2397 }
2398 
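     // Maps a BoolTest condition to the corresponding Assembler::ComparisonPredicateFP.
     // All predicates are non-signaling; only != uses an unordered compare (see the JLS
     // note below).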
2399 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
2400   switch (bt) {
2401   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
2402   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
2403   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
2404   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
2405   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
2406   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
2407   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
2408   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
2409   }
2410 }
2411 
2412 // Helper methods for MachSpillCopyNode::implementation().
2413 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
2414                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
2415   assert(ireg == Op_VecS || // 32bit vector
2416          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
2417          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
2418          "no non-adjacent vector moves" );
2419   if (cbuf) {
2420     C2_MacroAssembler _masm(cbuf);
2421     switch (ireg) {
2422     case Op_VecS: // copy whole register
2423     case Op_VecD:
2424     case Op_VecX:
2425 #ifndef _LP64
2426       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2427 #else
2428       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2429         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2430       } else {
2431         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2432       }
2433 #endif
2434       break;
2435     case Op_VecY:
2436 #ifndef _LP64
2437       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2438 #else
2439       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2440         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2441       } else {
2442         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2443       }
2444 #endif
2445       break;
2446     case Op_VecZ:
2447       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
2448       break;
2449     default:
2450       ShouldNotReachHere();
2451     }
2452 #ifndef PRODUCT
2453   } else {
2454     switch (ireg) {
2455     case Op_VecS:
2456     case Op_VecD:
2457     case Op_VecX:
2458       st->print("movdqu  %s,%s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
2459       break;
2460     case Op_VecY:
2461     case Op_VecZ:
2462       st->print("vmovdqu %s,%s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
2463       break;
2464     default:
2465       ShouldNotReachHere();
2466     }
2467 #endif
2468   }
2469 }
2470 
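     // Loads or stores a vector register to/from the stack slot at 'stack_offset';
     // 'ireg' (Op_VecS..Op_VecZ) selects the vector width. When 'cbuf' is NULL only the
     // disassembly string is printed (non-product builds).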
2471 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
2472                      int stack_offset, int reg, uint ireg, outputStream* st) {
2473   if (cbuf) {
2474     C2_MacroAssembler _masm(cbuf);
2475     if (is_load) {
2476       switch (ireg) {
2477       case Op_VecS:
2478         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2479         break;
2480       case Op_VecD:
2481         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2482         break;
2483       case Op_VecX:
2484 #ifndef _LP64
2485         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2486 #else
2487         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2488           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2489         } else {
2490           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2491           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2492         }
2493 #endif
2494         break;
2495       case Op_VecY:
2496 #ifndef _LP64
2497         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2498 #else
2499         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2500           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2501         } else {
2502           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2503           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2504         }
2505 #endif
2506         break;
2507       case Op_VecZ:
2508         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
2509         break;
2510       default:
2511         ShouldNotReachHere();
2512       }
2513     } else { // store
2514       switch (ireg) {
2515       case Op_VecS:
2516         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2517         break;
2518       case Op_VecD:
2519         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2520         break;
2521       case Op_VecX:
2522 #ifndef _LP64
2523         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2524 #else
2525         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2526           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2527         }
2528         else {
2529           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2530         }
2531 #endif
2532         break;
2533       case Op_VecY:
2534 #ifndef _LP64
2535         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2536 #else
2537         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2538           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2539         }
2540         else {
2541           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2542         }
2543 #endif
2544         break;
2545       case Op_VecZ:
2546         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2547         break;
2548       default:
2549         ShouldNotReachHere();
2550       }
2551     }
2552 #ifndef PRODUCT
2553   } else {
2554     if (is_load) {
2555       switch (ireg) {
2556       case Op_VecS:
2557         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2558         break;
2559       case Op_VecD:
2560         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2561         break;
2562       case Op_VecX:
2563         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2564         break;
2565       case Op_VecY:
2566       case Op_VecZ:
2567         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2568         break;
2569       default:
2570         ShouldNotReachHere();
2571       }
2572     } else { // store
2573       switch (ireg) {
2574       case Op_VecS:
2575         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2576         break;
2577       case Op_VecD:
2578         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2579         break;
2580       case Op_VecX:
2581         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2582         break;
2583       case Op_VecY:
2584       case Op_VecZ:
2585         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2586         break;
2587       default:
2588         ShouldNotReachHere();
2589       }
2590     }
2591 #endif
2592   }
2593 }
2594 
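     // Builds a GrowableArray holding 'len' copies of the scalar constant 'con',
     // interpreted according to the element type 'bt'; used when materializing
     // replicated vector constants.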
2595 template <class T>
2596 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
2597   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
2598   jvalue ele;
2599   switch (bt) {
2600     case T_BYTE:   ele.b = con; break;
2601     case T_SHORT:  ele.s = con; break;
2602     case T_INT:    ele.i = con; break;
2603     case T_LONG:   ele.j = con; break;
2604     case T_FLOAT:  ele.f = con; break;
2605     case T_DOUBLE: ele.d = con; break;
2606     default: ShouldNotReachHere();
2607   }
2608   for (int i = 0; i < len; i++) {
2609     val->append(ele);
2610   }
2611   return val;
2612 }
2613 
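     // Returns a 64-bit pattern with the sign (most significant) bit of every
     // 'bt'-sized lane set and all other bits clear.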
2614 static inline jlong high_bit_set(BasicType bt) {
2615   switch (bt) {
2616     case T_BYTE:  return 0x8080808080808080;
2617     case T_SHORT: return 0x8000800080008000;
2618     case T_INT:   return 0x8000000080000000;
2619     case T_LONG:  return 0x8000000000000000;
2620     default:
2621       ShouldNotReachHere();
2622       return 0;
2623   }
2624 }
2625 
2626 #ifndef PRODUCT
2627   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2628     st->print("nop \t# %d bytes pad for loops and calls", _count);
2629   }
2630 #endif
2631 
2632   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
2633     C2_MacroAssembler _masm(&cbuf);
2634     __ nop(_count);
2635   }
2636 
2637   uint MachNopNode::size(PhaseRegAlloc*) const {
2638     return _count;
2639   }
2640 
2641 #ifndef PRODUCT
2642   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2643     st->print("# breakpoint");
2644   }
2645 #endif
2646 
2647   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2648     C2_MacroAssembler _masm(&cbuf);
2649     __ int3();
2650   }
2651 
2652   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2653     return MachNode::size(ra_);
2654   }
2655 
2656 %}
2657 
2658 encode %{
2659 
2660   enc_class call_epilog %{
2661     if (VerifyStackAtCalls) {
2662       // Check that stack depth is unchanged: find majik cookie on stack
2663       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2664       C2_MacroAssembler _masm(&cbuf);
2665       Label L;
2666       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2667       __ jccb(Assembler::equal, L);
2668       // Die if stack mismatch
2669       __ int3();
2670       __ bind(L);
2671     }
2672   %}
2673 
2674 %}
2675 
2676 // Operands for bound floating pointer register arguments
2677 operand rxmm0() %{
2678   constraint(ALLOC_IN_RC(xmm0_reg));
2679   match(VecX);
2680   format %{ %}
2681   interface(REG_INTER);
2682 %}
2683 
2684 //----------OPERANDS-----------------------------------------------------------
2685 // Operand definitions must precede instruction definitions for correct parsing
2686 // in the ADLC because operands constitute user defined types which are used in
2687 // instruction definitions.
2688 
2689 // Vectors
2690 
2691 // Dummy generic vector class. Should be used for all vector operands.
2692 // Replaced with vec[SDXYZ] during post-selection pass.
2693 operand vec() %{
2694   constraint(ALLOC_IN_RC(dynamic));
2695   match(VecX);
2696   match(VecY);
2697   match(VecZ);
2698   match(VecS);
2699   match(VecD);
2700 
2701   format %{ %}
2702   interface(REG_INTER);
2703 %}
2704 
2705 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
2706 // Replaced with legVec[SDXYZ] during post-selection cleanup.
2707 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
2708 // runtime code generation via reg_class_dynamic.
2709 operand legVec() %{
2710   constraint(ALLOC_IN_RC(dynamic));
2711   match(VecX);
2712   match(VecY);
2713   match(VecZ);
2714   match(VecS);
2715   match(VecD);
2716 
2717   format %{ %}
2718   interface(REG_INTER);
2719 %}
2720 
2721 // Replaces vec during post-selection cleanup. See above.
2722 operand vecS() %{
2723   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
2724   match(VecS);
2725 
2726   format %{ %}
2727   interface(REG_INTER);
2728 %}
2729 
2730 // Replaces legVec during post-selection cleanup. See above.
2731 operand legVecS() %{
2732   constraint(ALLOC_IN_RC(vectors_reg_legacy));
2733   match(VecS);
2734 
2735   format %{ %}
2736   interface(REG_INTER);
2737 %}
2738 
2739 // Replaces vec during post-selection cleanup. See above.
2740 operand vecD() %{
2741   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
2742   match(VecD);
2743 
2744   format %{ %}
2745   interface(REG_INTER);
2746 %}
2747 
2748 // Replaces legVec during post-selection cleanup. See above.
2749 operand legVecD() %{
2750   constraint(ALLOC_IN_RC(vectord_reg_legacy));
2751   match(VecD);
2752 
2753   format %{ %}
2754   interface(REG_INTER);
2755 %}
2756 
2757 // Replaces vec during post-selection cleanup. See above.
2758 operand vecX() %{
2759   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
2760   match(VecX);
2761 
2762   format %{ %}
2763   interface(REG_INTER);
2764 %}
2765 
2766 // Replaces legVec during post-selection cleanup. See above.
2767 operand legVecX() %{
2768   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
2769   match(VecX);
2770 
2771   format %{ %}
2772   interface(REG_INTER);
2773 %}
2774 
2775 // Replaces vec during post-selection cleanup. See above.
2776 operand vecY() %{
2777   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
2778   match(VecY);
2779 
2780   format %{ %}
2781   interface(REG_INTER);
2782 %}
2783 
2784 // Replaces legVec during post-selection cleanup. See above.
2785 operand legVecY() %{
2786   constraint(ALLOC_IN_RC(vectory_reg_legacy));
2787   match(VecY);
2788 
2789   format %{ %}
2790   interface(REG_INTER);
2791 %}
2792 
2793 // Replaces vec during post-selection cleanup. See above.
2794 operand vecZ() %{
2795   constraint(ALLOC_IN_RC(vectorz_reg));
2796   match(VecZ);
2797 
2798   format %{ %}
2799   interface(REG_INTER);
2800 %}
2801 
2802 // Replaces legVec during post-selection cleanup. See above.
2803 operand legVecZ() %{
2804   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
2805   match(VecZ);
2806 
2807   format %{ %}
2808   interface(REG_INTER);
2809 %}
2810 
2811 // Comparison Code for FP conditional move
2812 operand cmpOp_vcmppd() %{
2813   match(Bool);
2814 
2815   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2816             n->as_Bool()->_test._test != BoolTest::no_overflow);
2817   format %{ "" %}
2818   interface(COND_INTER) %{
2819     equal        (0x0, "eq");
2820     less         (0x1, "lt");
2821     less_equal   (0x2, "le");
2822     not_equal    (0xC, "ne");
2823     greater_equal(0xD, "ge");
2824     greater      (0xE, "gt");
2825     //TODO cannot compile (adlc breaks) without two next lines with error:
2826     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2827     // equal' for overflow.
2828     overflow     (0x20, "o");  // not really supported by the instruction
2829     no_overflow  (0x21, "no"); // not really supported by the instruction
2830   %}
2831 %}
2832 
2833 
2834 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2835 
2836 // ============================================================================
2837 
2838 instruct ShouldNotReachHere() %{
2839   match(Halt);
2840   format %{ "stop\t# ShouldNotReachHere" %}
2841   ins_encode %{
2842     if (is_reachable()) {
2843       __ stop(_halt_reason);
2844     }
2845   %}
2846   ins_pipe(pipe_slow);
2847 %}
2848 
2849 // ============================================================================
2850 
2851 instruct addF_reg(regF dst, regF src) %{
2852   predicate((UseSSE>=1) && (UseAVX == 0));
2853   match(Set dst (AddF dst src));
2854 
2855   format %{ "addss   $dst, $src" %}
2856   ins_cost(150);
2857   ins_encode %{
2858     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2859   %}
2860   ins_pipe(pipe_slow);
2861 %}
2862 
2863 instruct addF_mem(regF dst, memory src) %{
2864   predicate((UseSSE>=1) && (UseAVX == 0));
2865   match(Set dst (AddF dst (LoadF src)));
2866 
2867   format %{ "addss   $dst, $src" %}
2868   ins_cost(150);
2869   ins_encode %{
2870     __ addss($dst$$XMMRegister, $src$$Address);
2871   %}
2872   ins_pipe(pipe_slow);
2873 %}
2874 
2875 instruct addF_imm(regF dst, immF con) %{
2876   predicate((UseSSE>=1) && (UseAVX == 0));
2877   match(Set dst (AddF dst con));
2878   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2879   ins_cost(150);
2880   ins_encode %{
2881     __ addss($dst$$XMMRegister, $constantaddress($con));
2882   %}
2883   ins_pipe(pipe_slow);
2884 %}
2885 
2886 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2887   predicate(UseAVX > 0);
2888   match(Set dst (AddF src1 src2));
2889 
2890   format %{ "vaddss  $dst, $src1, $src2" %}
2891   ins_cost(150);
2892   ins_encode %{
2893     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2894   %}
2895   ins_pipe(pipe_slow);
2896 %}
2897 
2898 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2899   predicate(UseAVX > 0);
2900   match(Set dst (AddF src1 (LoadF src2)));
2901 
2902   format %{ "vaddss  $dst, $src1, $src2" %}
2903   ins_cost(150);
2904   ins_encode %{
2905     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2906   %}
2907   ins_pipe(pipe_slow);
2908 %}
2909 
2910 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2911   predicate(UseAVX > 0);
2912   match(Set dst (AddF src con));
2913 
2914   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2915   ins_cost(150);
2916   ins_encode %{
2917     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2918   %}
2919   ins_pipe(pipe_slow);
2920 %}
2921 
2922 instruct addD_reg(regD dst, regD src) %{
2923   predicate((UseSSE>=2) && (UseAVX == 0));
2924   match(Set dst (AddD dst src));
2925 
2926   format %{ "addsd   $dst, $src" %}
2927   ins_cost(150);
2928   ins_encode %{
2929     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2930   %}
2931   ins_pipe(pipe_slow);
2932 %}
2933 
2934 instruct addD_mem(regD dst, memory src) %{
2935   predicate((UseSSE>=2) && (UseAVX == 0));
2936   match(Set dst (AddD dst (LoadD src)));
2937 
2938   format %{ "addsd   $dst, $src" %}
2939   ins_cost(150);
2940   ins_encode %{
2941     __ addsd($dst$$XMMRegister, $src$$Address);
2942   %}
2943   ins_pipe(pipe_slow);
2944 %}
2945 
2946 instruct addD_imm(regD dst, immD con) %{
2947   predicate((UseSSE>=2) && (UseAVX == 0));
2948   match(Set dst (AddD dst con));
2949   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2950   ins_cost(150);
2951   ins_encode %{
2952     __ addsd($dst$$XMMRegister, $constantaddress($con));
2953   %}
2954   ins_pipe(pipe_slow);
2955 %}
2956 
2957 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2958   predicate(UseAVX > 0);
2959   match(Set dst (AddD src1 src2));
2960 
2961   format %{ "vaddsd  $dst, $src1, $src2" %}
2962   ins_cost(150);
2963   ins_encode %{
2964     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2965   %}
2966   ins_pipe(pipe_slow);
2967 %}
2968 
2969 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2970   predicate(UseAVX > 0);
2971   match(Set dst (AddD src1 (LoadD src2)));
2972 
2973   format %{ "vaddsd  $dst, $src1, $src2" %}
2974   ins_cost(150);
2975   ins_encode %{
2976     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2977   %}
2978   ins_pipe(pipe_slow);
2979 %}
2980 
2981 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2982   predicate(UseAVX > 0);
2983   match(Set dst (AddD src con));
2984 
2985   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2986   ins_cost(150);
2987   ins_encode %{
2988     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2989   %}
2990   ins_pipe(pipe_slow);
2991 %}
2992 
2993 instruct subF_reg(regF dst, regF src) %{
2994   predicate((UseSSE>=1) && (UseAVX == 0));
2995   match(Set dst (SubF dst src));
2996 
2997   format %{ "subss   $dst, $src" %}
2998   ins_cost(150);
2999   ins_encode %{
3000     __ subss($dst$$XMMRegister, $src$$XMMRegister);
3001   %}
3002   ins_pipe(pipe_slow);
3003 %}
3004 
3005 instruct subF_mem(regF dst, memory src) %{
3006   predicate((UseSSE>=1) && (UseAVX == 0));
3007   match(Set dst (SubF dst (LoadF src)));
3008 
3009   format %{ "subss   $dst, $src" %}
3010   ins_cost(150);
3011   ins_encode %{
3012     __ subss($dst$$XMMRegister, $src$$Address);
3013   %}
3014   ins_pipe(pipe_slow);
3015 %}
3016 
3017 instruct subF_imm(regF dst, immF con) %{
3018   predicate((UseSSE>=1) && (UseAVX == 0));
3019   match(Set dst (SubF dst con));
3020   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3021   ins_cost(150);
3022   ins_encode %{
3023     __ subss($dst$$XMMRegister, $constantaddress($con));
3024   %}
3025   ins_pipe(pipe_slow);
3026 %}
3027 
3028 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
3029   predicate(UseAVX > 0);
3030   match(Set dst (SubF src1 src2));
3031 
3032   format %{ "vsubss  $dst, $src1, $src2" %}
3033   ins_cost(150);
3034   ins_encode %{
3035     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3036   %}
3037   ins_pipe(pipe_slow);
3038 %}
3039 
3040 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
3041   predicate(UseAVX > 0);
3042   match(Set dst (SubF src1 (LoadF src2)));
3043 
3044   format %{ "vsubss  $dst, $src1, $src2" %}
3045   ins_cost(150);
3046   ins_encode %{
3047     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3048   %}
3049   ins_pipe(pipe_slow);
3050 %}
3051 
3052 instruct subF_reg_imm(regF dst, regF src, immF con) %{
3053   predicate(UseAVX > 0);
3054   match(Set dst (SubF src con));
3055 
3056   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3057   ins_cost(150);
3058   ins_encode %{
3059     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3060   %}
3061   ins_pipe(pipe_slow);
3062 %}
3063 
3064 instruct subD_reg(regD dst, regD src) %{
3065   predicate((UseSSE>=2) && (UseAVX == 0));
3066   match(Set dst (SubD dst src));
3067 
3068   format %{ "subsd   $dst, $src" %}
3069   ins_cost(150);
3070   ins_encode %{
3071     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
3072   %}
3073   ins_pipe(pipe_slow);
3074 %}
3075 
3076 instruct subD_mem(regD dst, memory src) %{
3077   predicate((UseSSE>=2) && (UseAVX == 0));
3078   match(Set dst (SubD dst (LoadD src)));
3079 
3080   format %{ "subsd   $dst, $src" %}
3081   ins_cost(150);
3082   ins_encode %{
3083     __ subsd($dst$$XMMRegister, $src$$Address);
3084   %}
3085   ins_pipe(pipe_slow);
3086 %}
3087 
3088 instruct subD_imm(regD dst, immD con) %{
3089   predicate((UseSSE>=2) && (UseAVX == 0));
3090   match(Set dst (SubD dst con));
3091   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3092   ins_cost(150);
3093   ins_encode %{
3094     __ subsd($dst$$XMMRegister, $constantaddress($con));
3095   %}
3096   ins_pipe(pipe_slow);
3097 %}
3098 
3099 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
3100   predicate(UseAVX > 0);
3101   match(Set dst (SubD src1 src2));
3102 
3103   format %{ "vsubsd  $dst, $src1, $src2" %}
3104   ins_cost(150);
3105   ins_encode %{
3106     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3107   %}
3108   ins_pipe(pipe_slow);
3109 %}
3110 
3111 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
3112   predicate(UseAVX > 0);
3113   match(Set dst (SubD src1 (LoadD src2)));
3114 
3115   format %{ "vsubsd  $dst, $src1, $src2" %}
3116   ins_cost(150);
3117   ins_encode %{
3118     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3119   %}
3120   ins_pipe(pipe_slow);
3121 %}
3122 
3123 instruct subD_reg_imm(regD dst, regD src, immD con) %{
3124   predicate(UseAVX > 0);
3125   match(Set dst (SubD src con));
3126 
3127   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3128   ins_cost(150);
3129   ins_encode %{
3130     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3131   %}
3132   ins_pipe(pipe_slow);
3133 %}
3134 
3135 instruct mulF_reg(regF dst, regF src) %{
3136   predicate((UseSSE>=1) && (UseAVX == 0));
3137   match(Set dst (MulF dst src));
3138 
3139   format %{ "mulss   $dst, $src" %}
3140   ins_cost(150);
3141   ins_encode %{
3142     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
3143   %}
3144   ins_pipe(pipe_slow);
3145 %}
3146 
3147 instruct mulF_mem(regF dst, memory src) %{
3148   predicate((UseSSE>=1) && (UseAVX == 0));
3149   match(Set dst (MulF dst (LoadF src)));
3150 
3151   format %{ "mulss   $dst, $src" %}
3152   ins_cost(150);
3153   ins_encode %{
3154     __ mulss($dst$$XMMRegister, $src$$Address);
3155   %}
3156   ins_pipe(pipe_slow);
3157 %}
3158 
3159 instruct mulF_imm(regF dst, immF con) %{
3160   predicate((UseSSE>=1) && (UseAVX == 0));
3161   match(Set dst (MulF dst con));
3162   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3163   ins_cost(150);
3164   ins_encode %{
3165     __ mulss($dst$$XMMRegister, $constantaddress($con));
3166   %}
3167   ins_pipe(pipe_slow);
3168 %}
3169 
3170 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
3171   predicate(UseAVX > 0);
3172   match(Set dst (MulF src1 src2));
3173 
3174   format %{ "vmulss  $dst, $src1, $src2" %}
3175   ins_cost(150);
3176   ins_encode %{
3177     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3178   %}
3179   ins_pipe(pipe_slow);
3180 %}
3181 
3182 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
3183   predicate(UseAVX > 0);
3184   match(Set dst (MulF src1 (LoadF src2)));
3185 
3186   format %{ "vmulss  $dst, $src1, $src2" %}
3187   ins_cost(150);
3188   ins_encode %{
3189     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3190   %}
3191   ins_pipe(pipe_slow);
3192 %}
3193 
3194 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
3195   predicate(UseAVX > 0);
3196   match(Set dst (MulF src con));
3197 
3198   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3199   ins_cost(150);
3200   ins_encode %{
3201     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3202   %}
3203   ins_pipe(pipe_slow);
3204 %}
3205 
3206 instruct mulD_reg(regD dst, regD src) %{
3207   predicate((UseSSE>=2) && (UseAVX == 0));
3208   match(Set dst (MulD dst src));
3209 
3210   format %{ "mulsd   $dst, $src" %}
3211   ins_cost(150);
3212   ins_encode %{
3213     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
3214   %}
3215   ins_pipe(pipe_slow);
3216 %}
3217 
3218 instruct mulD_mem(regD dst, memory src) %{
3219   predicate((UseSSE>=2) && (UseAVX == 0));
3220   match(Set dst (MulD dst (LoadD src)));
3221 
3222   format %{ "mulsd   $dst, $src" %}
3223   ins_cost(150);
3224   ins_encode %{
3225     __ mulsd($dst$$XMMRegister, $src$$Address);
3226   %}
3227   ins_pipe(pipe_slow);
3228 %}
3229 
3230 instruct mulD_imm(regD dst, immD con) %{
3231   predicate((UseSSE>=2) && (UseAVX == 0));
3232   match(Set dst (MulD dst con));
3233   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3234   ins_cost(150);
3235   ins_encode %{
3236     __ mulsd($dst$$XMMRegister, $constantaddress($con));
3237   %}
3238   ins_pipe(pipe_slow);
3239 %}
3240 
3241 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
3242   predicate(UseAVX > 0);
3243   match(Set dst (MulD src1 src2));
3244 
3245   format %{ "vmulsd  $dst, $src1, $src2" %}
3246   ins_cost(150);
3247   ins_encode %{
3248     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3249   %}
3250   ins_pipe(pipe_slow);
3251 %}
3252 
3253 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
3254   predicate(UseAVX > 0);
3255   match(Set dst (MulD src1 (LoadD src2)));
3256 
3257   format %{ "vmulsd  $dst, $src1, $src2" %}
3258   ins_cost(150);
3259   ins_encode %{
3260     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3261   %}
3262   ins_pipe(pipe_slow);
3263 %}
3264 
3265 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
3266   predicate(UseAVX > 0);
3267   match(Set dst (MulD src con));
3268 
3269   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3270   ins_cost(150);
3271   ins_encode %{
3272     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3273   %}
3274   ins_pipe(pipe_slow);
3275 %}
3276 
3277 instruct divF_reg(regF dst, regF src) %{
3278   predicate((UseSSE>=1) && (UseAVX == 0));
3279   match(Set dst (DivF dst src));
3280 
3281   format %{ "divss   $dst, $src" %}
3282   ins_cost(150);
3283   ins_encode %{
3284     __ divss($dst$$XMMRegister, $src$$XMMRegister);
3285   %}
3286   ins_pipe(pipe_slow);
3287 %}
3288 
3289 instruct divF_mem(regF dst, memory src) %{
3290   predicate((UseSSE>=1) && (UseAVX == 0));
3291   match(Set dst (DivF dst (LoadF src)));
3292 
3293   format %{ "divss   $dst, $src" %}
3294   ins_cost(150);
3295   ins_encode %{
3296     __ divss($dst$$XMMRegister, $src$$Address);
3297   %}
3298   ins_pipe(pipe_slow);
3299 %}
3300 
3301 instruct divF_imm(regF dst, immF con) %{
3302   predicate((UseSSE>=1) && (UseAVX == 0));
3303   match(Set dst (DivF dst con));
3304   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3305   ins_cost(150);
3306   ins_encode %{
3307     __ divss($dst$$XMMRegister, $constantaddress($con));
3308   %}
3309   ins_pipe(pipe_slow);
3310 %}
3311 
3312 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
3313   predicate(UseAVX > 0);
3314   match(Set dst (DivF src1 src2));
3315 
3316   format %{ "vdivss  $dst, $src1, $src2" %}
3317   ins_cost(150);
3318   ins_encode %{
3319     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3320   %}
3321   ins_pipe(pipe_slow);
3322 %}
3323 
3324 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
3325   predicate(UseAVX > 0);
3326   match(Set dst (DivF src1 (LoadF src2)));
3327 
3328   format %{ "vdivss  $dst, $src1, $src2" %}
3329   ins_cost(150);
3330   ins_encode %{
3331     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3332   %}
3333   ins_pipe(pipe_slow);
3334 %}
3335 
3336 instruct divF_reg_imm(regF dst, regF src, immF con) %{
3337   predicate(UseAVX > 0);
3338   match(Set dst (DivF src con));
3339 
3340   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3341   ins_cost(150);
3342   ins_encode %{
3343     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3344   %}
3345   ins_pipe(pipe_slow);
3346 %}
3347 
3348 instruct divD_reg(regD dst, regD src) %{
3349   predicate((UseSSE>=2) && (UseAVX == 0));
3350   match(Set dst (DivD dst src));
3351 
3352   format %{ "divsd   $dst, $src" %}
3353   ins_cost(150);
3354   ins_encode %{
3355     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
3356   %}
3357   ins_pipe(pipe_slow);
3358 %}
3359 
3360 instruct divD_mem(regD dst, memory src) %{
3361   predicate((UseSSE>=2) && (UseAVX == 0));
3362   match(Set dst (DivD dst (LoadD src)));
3363 
3364   format %{ "divsd   $dst, $src" %}
3365   ins_cost(150);
3366   ins_encode %{
3367     __ divsd($dst$$XMMRegister, $src$$Address);
3368   %}
3369   ins_pipe(pipe_slow);
3370 %}
3371 
3372 instruct divD_imm(regD dst, immD con) %{
3373   predicate((UseSSE>=2) && (UseAVX == 0));
3374   match(Set dst (DivD dst con));
3375   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3376   ins_cost(150);
3377   ins_encode %{
3378     __ divsd($dst$$XMMRegister, $constantaddress($con));
3379   %}
3380   ins_pipe(pipe_slow);
3381 %}
3382 
3383 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
3384   predicate(UseAVX > 0);
3385   match(Set dst (DivD src1 src2));
3386 
3387   format %{ "vdivsd  $dst, $src1, $src2" %}
3388   ins_cost(150);
3389   ins_encode %{
3390     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3391   %}
3392   ins_pipe(pipe_slow);
3393 %}
3394 
3395 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
3396   predicate(UseAVX > 0);
3397   match(Set dst (DivD src1 (LoadD src2)));
3398 
3399   format %{ "vdivsd  $dst, $src1, $src2" %}
3400   ins_cost(150);
3401   ins_encode %{
3402     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3403   %}
3404   ins_pipe(pipe_slow);
3405 %}
3406 
3407 instruct divD_reg_imm(regD dst, regD src, immD con) %{
3408   predicate(UseAVX > 0);
3409   match(Set dst (DivD src con));
3410 
3411   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3412   ins_cost(150);
3413   ins_encode %{
3414     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3415   %}
3416   ins_pipe(pipe_slow);
3417 %}
3418 
3419 instruct absF_reg(regF dst) %{
3420   predicate((UseSSE>=1) && (UseAVX == 0));
3421   match(Set dst (AbsF dst));
3422   ins_cost(150);
3423   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
3424   ins_encode %{
3425     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
3426   %}
3427   ins_pipe(pipe_slow);
3428 %}
3429 
3430 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
3431   predicate(UseAVX > 0);
3432   match(Set dst (AbsF src));
3433   ins_cost(150);
3434   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
3435   ins_encode %{
3436     int vlen_enc = Assembler::AVX_128bit;
3437     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
3438               ExternalAddress(float_signmask()), vlen_enc);
3439   %}
3440   ins_pipe(pipe_slow);
3441 %}
3442 
3443 instruct absD_reg(regD dst) %{
3444   predicate((UseSSE>=2) && (UseAVX == 0));
3445   match(Set dst (AbsD dst));
3446   ins_cost(150);
3447   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
3448             "# abs double by sign masking" %}
3449   ins_encode %{
3450     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
3451   %}
3452   ins_pipe(pipe_slow);
3453 %}
3454 
3455 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
3456   predicate(UseAVX > 0);
3457   match(Set dst (AbsD src));
3458   ins_cost(150);
3459   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
3460             "# abs double by sign masking" %}
3461   ins_encode %{
3462     int vlen_enc = Assembler::AVX_128bit;
3463     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
3464               ExternalAddress(double_signmask()), vlen_enc);
3465   %}
3466   ins_pipe(pipe_slow);
3467 %}
3468 
3469 instruct negF_reg(regF dst) %{
3470   predicate((UseSSE>=1) && (UseAVX == 0));
3471   match(Set dst (NegF dst));
3472   ins_cost(150);
3473   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
3474   ins_encode %{
3475     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
3476   %}
3477   ins_pipe(pipe_slow);
3478 %}
3479 
3480 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
3481   predicate(UseAVX > 0);
3482   match(Set dst (NegF src));
3483   ins_cost(150);
3484   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
3485   ins_encode %{
3486     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
3487                  ExternalAddress(float_signflip()));
3488   %}
3489   ins_pipe(pipe_slow);
3490 %}
3491 
3492 instruct negD_reg(regD dst) %{
3493   predicate((UseSSE>=2) && (UseAVX == 0));
3494   match(Set dst (NegD dst));
3495   ins_cost(150);
3496   format %{ "xorpd   $dst, [0x8000000000000000]\t"
3497             "# neg double by sign flipping" %}
3498   ins_encode %{
3499     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
3500   %}
3501   ins_pipe(pipe_slow);
3502 %}
3503 
3504 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
3505   predicate(UseAVX > 0);
3506   match(Set dst (NegD src));
3507   ins_cost(150);
3508   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
3509             "# neg double by sign flipping" %}
3510   ins_encode %{
3511     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
3512                  ExternalAddress(double_signflip()));
3513   %}
3514   ins_pipe(pipe_slow);
3515 %}
3516 
3517 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
3518 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
3519 instruct sqrtF_reg(regF dst) %{
3520   predicate(UseSSE>=1);
3521   match(Set dst (SqrtF dst));
3522   format %{ "sqrtss  $dst, $dst" %}
3523   ins_encode %{
3524     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
3525   %}
3526   ins_pipe(pipe_slow);
3527 %}
3528 
3529 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
3530 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
3531 instruct sqrtD_reg(regD dst) %{
3532   predicate(UseSSE>=2);
3533   match(Set dst (SqrtD dst));
3534   format %{ "sqrtsd  $dst, $dst" %}
3535   ins_encode %{
3536     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
3537   %}
3538   ins_pipe(pipe_slow);
3539 %}
3540 
3541 
3542 // ---------------------------------------- VectorReinterpret ------------------------------------
3543 instruct reinterpret_mask(kReg dst) %{
3544   predicate(n->bottom_type()->isa_vectmask() &&
3545             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
3546   match(Set dst (VectorReinterpret dst));
3547   ins_cost(125);
3548   format %{ "vector_reinterpret $dst\t!" %}
3549   ins_encode %{
3550     // empty
3551   %}
3552   ins_pipe( pipe_slow );
3553 %}
3554 
3555 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
3556   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3557             n->bottom_type()->isa_vectmask() &&
3558             n->in(1)->bottom_type()->isa_vectmask() &&
3559             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
3560             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src mask sizes match in bits
3561   match(Set dst (VectorReinterpret src));
3562   effect(TEMP xtmp);
3563   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
3564   ins_encode %{
3565      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
3566      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3567      assert(src_sz == dst_sz , "src and dst size mismatch");
3568      int vlen_enc = vector_length_encoding(src_sz);
3569      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3570      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3571   %}
3572   ins_pipe( pipe_slow );
3573 %}
3574 
3575 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
3576   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3577             n->bottom_type()->isa_vectmask() &&
3578             n->in(1)->bottom_type()->isa_vectmask() &&
3579             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
3580              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
3581             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src mask sizes match in bits
3582   match(Set dst (VectorReinterpret src));
3583   effect(TEMP xtmp);
3584   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
3585   ins_encode %{
3586      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
3587      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3588      assert(src_sz == dst_sz , "src and dst size mismatch");
3589      int vlen_enc = vector_length_encoding(src_sz);
3590      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3591      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3592   %}
3593   ins_pipe( pipe_slow );
3594 %}
3595 
3596 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
3597   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3598             n->bottom_type()->isa_vectmask() &&
3599             n->in(1)->bottom_type()->isa_vectmask() &&
3600             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
3601              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
3602             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst and src mask sizes match in bits
3603   match(Set dst (VectorReinterpret src));
3604   effect(TEMP xtmp);
3605   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
3606   ins_encode %{
3607      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
3608      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3609      assert(src_sz == dst_sz , "src and dst size mismatch");
3610      int vlen_enc = vector_length_encoding(src_sz);
3611      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3612      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3613   %}
3614   ins_pipe( pipe_slow );
3615 %}
3616 
3617 instruct reinterpret(vec dst) %{
3618   predicate(!n->bottom_type()->isa_vectmask() &&
3619             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
3620   match(Set dst (VectorReinterpret dst));
3621   ins_cost(125);
3622   format %{ "vector_reinterpret $dst\t!" %}
3623   ins_encode %{
3624     // empty
3625   %}
3626   ins_pipe( pipe_slow );
3627 %}
3628 
3629 instruct reinterpret_expand(vec dst, vec src, rRegP scratch) %{
3630   predicate(UseAVX == 0 &&
3631             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3632   match(Set dst (VectorReinterpret src));
3633   ins_cost(125);
3634   effect(TEMP dst, TEMP scratch);
3635   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3636   ins_encode %{
3637     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
3638     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
3639 
3640     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
3641     if (src_vlen_in_bytes == 4) {
3642       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3643     } else {
3644       assert(src_vlen_in_bytes == 8, "");
3645       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3646     }
3647     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3648   %}
3649   ins_pipe( pipe_slow );
3650 %}
3651 
3652 instruct vreinterpret_expand4(legVec dst, vec src, rRegP scratch) %{
3653   predicate(UseAVX > 0 &&
3654             !n->bottom_type()->isa_vectmask() &&
3655             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
3656             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3657   match(Set dst (VectorReinterpret src));
3658   ins_cost(125);
3659   effect(TEMP scratch);
3660   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3661   ins_encode %{
3662     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, $scratch$$Register);
3663   %}
3664   ins_pipe( pipe_slow );
3665 %}
3666 
3667 
3668 instruct vreinterpret_expand(legVec dst, vec src) %{
3669   predicate(UseAVX > 0 &&
3670             !n->bottom_type()->isa_vectmask() &&
3671             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
3672             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3673   match(Set dst (VectorReinterpret src));
3674   ins_cost(125);
3675   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
3676   ins_encode %{
3677     switch (Matcher::vector_length_in_bytes(this, $src)) {
3678       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3679       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3680       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3681       default: ShouldNotReachHere();
3682     }
3683   %}
3684   ins_pipe( pipe_slow );
3685 %}
3686 
3687 instruct reinterpret_shrink(vec dst, legVec src) %{
3688   predicate(!n->bottom_type()->isa_vectmask() &&
3689             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
3690   match(Set dst (VectorReinterpret src));
3691   ins_cost(125);
3692   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
3693   ins_encode %{
3694     switch (Matcher::vector_length_in_bytes(this)) {
3695       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
3696       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3697       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3698       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3699       default: ShouldNotReachHere();
3700     }
3701   %}
3702   ins_pipe( pipe_slow );
3703 %}
3704 
3705 // ----------------------------------------------------------------------------------------------------
3706 
3707 #ifdef _LP64
3708 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
3709   match(Set dst (RoundDoubleMode src rmode));
3710   format %{ "roundsd $dst,$src" %}
3711   ins_cost(150);
3712   ins_encode %{
3713     assert(UseSSE >= 4, "required");
3714     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
3715   %}
3716   ins_pipe(pipe_slow);
3717 %}
3718 
3719 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
3720   match(Set dst (RoundDoubleMode (LoadD src) rmode));
3721   format %{ "roundsd $dst,$src" %}
3722   ins_cost(150);
3723   ins_encode %{
3724     assert(UseSSE >= 4, "required");
3725     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
3726   %}
3727   ins_pipe(pipe_slow);
3728 %}
3729 
3730 instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
3731   match(Set dst (RoundDoubleMode con rmode));
3732   effect(TEMP scratch_reg);
3733   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
3734   ins_cost(150);
3735   ins_encode %{
3736     assert(UseSSE >= 4, "required");
3737     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
3738   %}
3739   ins_pipe(pipe_slow);
3740 %}
3741 
3742 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
3743   predicate(Matcher::vector_length(n) < 8);
3744   match(Set dst (RoundDoubleModeV src rmode));
3745   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
3746   ins_encode %{
3747     assert(UseAVX > 0, "required");
3748     int vlen_enc = vector_length_encoding(this);
3749     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
3750   %}
3751   ins_pipe( pipe_slow );
3752 %}
3753 
3754 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
3755   predicate(Matcher::vector_length(n) == 8);
3756   match(Set dst (RoundDoubleModeV src rmode));
3757   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
3758   ins_encode %{
3759     assert(UseAVX > 2, "required");
3760     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
3761   %}
3762   ins_pipe( pipe_slow );
3763 %}
3764 
3765 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3766   predicate(Matcher::vector_length(n) < 8);
3767   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3768   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3769   ins_encode %{
3770     assert(UseAVX > 0, "required");
3771     int vlen_enc = vector_length_encoding(this);
3772     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
3773   %}
3774   ins_pipe( pipe_slow );
3775 %}
3776 
3777 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3778   predicate(Matcher::vector_length(n) == 8);
3779   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3780   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3781   ins_encode %{
3782     assert(UseAVX > 2, "required");
3783     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3784   %}
3785   ins_pipe( pipe_slow );
3786 %}
3787 #endif // _LP64
3788 
3789 instruct onspinwait() %{
3790   match(OnSpinWait);
3791   ins_cost(200);
3792 
3793   format %{
3794     $$template
3795     $$emit$$"pause\t! membar_onspinwait"
3796   %}
3797   ins_encode %{
3798     __ pause();
3799   %}
3800   ins_pipe(pipe_slow);
3801 %}
3802 
3803 // a * b + c
3804 instruct fmaD_reg(regD a, regD b, regD c) %{
3805   predicate(UseFMA);
3806   match(Set c (FmaD  c (Binary a b)));
3807   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3808   ins_cost(150);
3809   ins_encode %{
3810     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3811   %}
3812   ins_pipe( pipe_slow );
3813 %}
3814 
3815 // a * b + c
3816 instruct fmaF_reg(regF a, regF b, regF c) %{
3817   predicate(UseFMA);
3818   match(Set c (FmaF  c (Binary a b)));
3819   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3820   ins_cost(150);
3821   ins_encode %{
3822     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3823   %}
3824   ins_pipe( pipe_slow );
3825 %}
3826 
3827 // ====================VECTOR INSTRUCTIONS=====================================
3828 
3829 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
3830 instruct MoveVec2Leg(legVec dst, vec src) %{
3831   match(Set dst src);
3832   format %{ "" %}
3833   ins_encode %{
3834     ShouldNotReachHere();
3835   %}
3836   ins_pipe( fpu_reg_reg );
3837 %}
3838 
3839 instruct MoveLeg2Vec(vec dst, legVec src) %{
3840   match(Set dst src);
3841   format %{ "" %}
3842   ins_encode %{
3843     ShouldNotReachHere();
3844   %}
3845   ins_pipe( fpu_reg_reg );
3846 %}
3847 
3848 // ============================================================================
3849 
3850 // Load vectors generic operand pattern
3851 instruct loadV(vec dst, memory mem) %{
3852   match(Set dst (LoadVector mem));
3853   ins_cost(125);
3854   format %{ "load_vector $dst,$mem" %}
3855   ins_encode %{
3856     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
3857   %}
3858   ins_pipe( pipe_slow );
3859 %}
3860 
3861 // Store vectors generic operand pattern.
3862 instruct storeV(memory mem, vec src) %{
3863   match(Set mem (StoreVector mem src));
3864   ins_cost(145);
3865   format %{ "store_vector $mem,$src\n\t" %}
3866   ins_encode %{
3867     switch (Matcher::vector_length_in_bytes(this, $src)) {
3868       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
3869       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
3870       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
3871       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
3872       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
3873       default: ShouldNotReachHere();
3874     }
3875   %}
3876   ins_pipe( pipe_slow );
3877 %}
3878 
3879 // ---------------------------------------- Gather ------------------------------------
3880 
3881 // Gather INT, LONG, FLOAT, DOUBLE
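     // Two code paths below: without AVX512VL an all-ones vector mask is materialized and
     // the AVX2 vgather form is used; with AVX-512 an all-ones opmask register drives the
     // evgather form instead.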
3882 
3883 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
3884   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
3885   match(Set dst (LoadVectorGather mem idx));
3886   effect(TEMP dst, TEMP tmp, TEMP mask);
3887   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
3888   ins_encode %{
3889     assert(UseAVX >= 2, "sanity");
3890 
3891     int vlen_enc = vector_length_encoding(this);
3892     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3893 
3894     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
3895     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3896 
3897     if (vlen_enc == Assembler::AVX_128bit) {
3898       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3899     } else {
3900       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3901     }
3902     __ lea($tmp$$Register, $mem$$Address);
3903     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
3904   %}
3905   ins_pipe( pipe_slow );
3906 %}
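
// Note on the sequence above: the AVX2 vpgather* forms gather only lanes whose mask bit
// is set and clear the mask as they go, so a fresh all-ones mask is reloaded before every
// gather. The lea materializes the base address in a general-purpose register because the
// gather's memory operand is built from base + scaled index vector.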
3907 
3908 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
3909   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
3910   match(Set dst (LoadVectorGather mem idx));
3911   effect(TEMP dst, TEMP tmp, TEMP ktmp);
3912   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
3913   ins_encode %{
3914     assert(UseAVX > 2, "sanity");
3915 
3916     int vlen_enc = vector_length_encoding(this);
3917     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3918 
3919     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3920 
3921     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3922     __ lea($tmp$$Register, $mem$$Address);
3923     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3924   %}
3925   ins_pipe( pipe_slow );
3926 %}
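
// The EVEX form differs only in how lanes are predicated: the gather is driven by an
// opmask (k) register, which the hardware likewise clears as elements complete, so
// ktmp is (re)loaded with all ones immediately before the gather.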
3927 
3928 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
3929   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
3930   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
3931   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and $ktmp as TEMP" %}
3932   ins_encode %{
3933     assert(UseAVX > 2, "sanity");
3934     int vlen_enc = vector_length_encoding(this);
3935     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3936     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3937     // Note: The gather instruction partially updates the opmask register used
3938     // for predication, hence the mask operand is copied to a temporary first.
3939     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
3940     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3941     __ lea($tmp$$Register, $mem$$Address);
3942     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3943   %}
3944   ins_pipe( pipe_slow );
3945 %}
3946 // ====================Scatter=======================================
3947 
3948 // Scatter INT, LONG, FLOAT, DOUBLE
3949 
3950 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
3951   predicate(UseAVX > 2);
3952   match(Set mem (StoreVectorScatter mem (Binary src idx)));
3953   effect(TEMP tmp, TEMP ktmp);
3954   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
3955   ins_encode %{
3956     int vlen_enc = vector_length_encoding(this, $src);
3957     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
3958 
3959     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
3960     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3961 
3962     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3963     __ lea($tmp$$Register, $mem$$Address);
3964     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
3965   %}
3966   ins_pipe( pipe_slow );
3967 %}
3968 
3969 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
3970   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
3971   effect(TEMP tmp, TEMP ktmp);
3972   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
3973   ins_encode %{
3974     int vlen_enc = vector_length_encoding(this, $src);
3975     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
3976     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
3977     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3978     // Note: The scatter instruction partially updates the opmask register used
3979     // for predication, hence the mask operand is copied to a temporary first.
3980     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
3981     __ lea($tmp$$Register, $mem$$Address);
3982     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
3983   %}
3984   ins_pipe( pipe_slow );
3985 %}
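
// Illustration (assumes the incubating Vector API, jdk.incubator.vector; not part of
// the matcher rules): LoadVectorGather / StoreVectorScatter nodes typically come from
// index-mapped accesses, e.g. given int[] a, b and an index map int[] idx:
//
//   import jdk.incubator.vector.IntVector;
//   import jdk.incubator.vector.VectorSpecies;
//
//   static final VectorSpecies<Integer> S = IntVector.SPECIES_256;
//
//   IntVector v = IntVector.fromArray(S, a, 0, idx, 0);  // gather  -> LoadVectorGather
//   v.intoArray(b, 0, idx, 0);                           // scatter -> StoreVectorScatter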
3986 
3987 // ====================REPLICATE=======================================
3988 
3989 // Replicate byte scalar to be vector
3990 instruct ReplB_reg(vec dst, rRegI src) %{
3991   match(Set dst (ReplicateB src));
3992   format %{ "replicateB $dst,$src" %}
3993   ins_encode %{
3994     uint vlen = Matcher::vector_length(this);
3995     if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
3996       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
3997       int vlen_enc = vector_length_encoding(this);
3998       __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
3999     } else if (VM_Version::supports_avx2()) {
4000       int vlen_enc = vector_length_encoding(this);
4001       __ movdl($dst$$XMMRegister, $src$$Register);
4002       __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4003     } else {
4004       __ movdl($dst$$XMMRegister, $src$$Register);
4005       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
4006       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4007       if (vlen >= 16) {
4008         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4009         if (vlen >= 32) {
4010           assert(vlen == 32, "sanity");
4011           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4012         }
4013       }
4014     }
4015   %}
4016   ins_pipe( pipe_slow );
4017 %}
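
// The pre-AVX2 fallback above broadcasts the byte with SSE shuffles only: movdl puts the
// GPR value in the low dword, punpcklbw pairs the low byte with itself, pshuflw(0x00)
// replicates that word across the low 64 bits, and punpcklqdq (plus vinserti128_high for
// 32-byte vectors) fills the remaining lanes.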
4018 
4019 instruct ReplB_mem(vec dst, memory mem) %{
4020   predicate(VM_Version::supports_avx2());
4021   match(Set dst (ReplicateB (LoadB mem)));
4022   format %{ "replicateB $dst,$mem" %}
4023   ins_encode %{
4024     int vlen_enc = vector_length_encoding(this);
4025     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
4026   %}
4027   ins_pipe( pipe_slow );
4028 %}
4029 
4030 instruct ReplB_imm(vec dst, immI con) %{
4031   match(Set dst (ReplicateB con));
4032   format %{ "replicateB $dst,$con" %}
4033   ins_encode %{
4034     InternalAddress addr = $constantaddress(T_BYTE, vreplicate_imm(T_BYTE, $con$$constant, Matcher::vector_length(this)));
4035     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
4036   %}
4037   ins_pipe( pipe_slow );
4038 %}
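
// The Repl*_imm rules share one scheme: vreplicate_imm builds the immediate replicated out
// to the requested vector length, $constantaddress places that value in the constant table,
// and load_vector simply reads vector_length_in_bytes of it back, so no broadcast
// instruction is needed at runtime.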
4039 
4040 // ====================ReplicateS=======================================
4041 
4042 instruct ReplS_reg(vec dst, rRegI src) %{
4043   match(Set dst (ReplicateS src));
4044   format %{ "replicateS $dst,$src" %}
4045   ins_encode %{
4046     uint vlen = Matcher::vector_length(this);
4047     if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
4048       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
4049       int vlen_enc = vector_length_encoding(this);
4050       __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
4051     } else if (VM_Version::supports_avx2()) {
4052       int vlen_enc = vector_length_encoding(this);
4053       __ movdl($dst$$XMMRegister, $src$$Register);
4054       __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4055     } else {
4056       __ movdl($dst$$XMMRegister, $src$$Register);
4057       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4058       if (vlen >= 8) {
4059         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4060         if (vlen >= 16) {
4061           assert(vlen == 16, "sanity");
4062           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4063         }
4064       }
4065     }
4066   %}
4067   ins_pipe( pipe_slow );
4068 %}
4069 
4070 instruct ReplS_mem(vec dst, memory mem) %{
4071   predicate(VM_Version::supports_avx2());
4072   match(Set dst (ReplicateS (LoadS mem)));
4073   format %{ "replicateS $dst,$mem" %}
4074   ins_encode %{
4075     int vlen_enc = vector_length_encoding(this);
4076     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
4077   %}
4078   ins_pipe( pipe_slow );
4079 %}
4080 
4081 instruct ReplS_imm(vec dst, immI con) %{
4082   match(Set dst (ReplicateS con));
4083   format %{ "replicateS $dst,$con" %}
4084   ins_encode %{
4085     InternalAddress addr = $constantaddress(T_SHORT, vreplicate_imm(T_SHORT, $con$$constant, Matcher::vector_length(this)));
4086     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
4087   %}
4088   ins_pipe( pipe_slow );
4089 %}
4090 
4091 // ====================ReplicateI=======================================
4092 
4093 instruct ReplI_reg(vec dst, rRegI src) %{
4094   match(Set dst (ReplicateI src));
4095   format %{ "replicateI $dst,$src" %}
4096   ins_encode %{
4097     uint vlen = Matcher::vector_length(this);
4098     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4099       int vlen_enc = vector_length_encoding(this);
4100       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
4101     } else if (VM_Version::supports_avx2()) {
4102       int vlen_enc = vector_length_encoding(this);
4103       __ movdl($dst$$XMMRegister, $src$$Register);
4104       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4105     } else {
4106       __ movdl($dst$$XMMRegister, $src$$Register);
4107       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4108       if (vlen >= 8) {
4109         assert(vlen == 8, "sanity");
4110         __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4111       }
4112     }
4113   %}
4114   ins_pipe( pipe_slow );
4115 %}
4116 
4117 instruct ReplI_mem(vec dst, memory mem) %{
4118   match(Set dst (ReplicateI (LoadI mem)));
4119   format %{ "replicateI $dst,$mem" %}
4120   ins_encode %{
4121     uint vlen = Matcher::vector_length(this);
4122     if (vlen <= 4) {
4123       __ movdl($dst$$XMMRegister, $mem$$Address);
4124       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4125     } else {
4126       assert(VM_Version::supports_avx2(), "sanity");
4127       int vlen_enc = vector_length_encoding(this);
4128       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4129     }
4130   %}
4131   ins_pipe( pipe_slow );
4132 %}
4133 
4134 instruct ReplI_imm(vec dst, immI con) %{
4135   match(Set dst (ReplicateI con));
4136   format %{ "replicateI $dst,$con" %}
4137   ins_encode %{
4138     InternalAddress addr = $constantaddress(T_INT, vreplicate_imm(T_INT, $con$$constant, Matcher::vector_length(this)));
4139     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
4140   %}
4141   ins_pipe( pipe_slow );
4142 %}
4143 
4144 // Replicate scalar zero to be vector
4145 instruct ReplI_zero(vec dst, immI_0 zero) %{
4146   match(Set dst (ReplicateB zero));
4147   match(Set dst (ReplicateS zero));
4148   match(Set dst (ReplicateI zero));
4149   format %{ "replicateI $dst,$zero" %}
4150   ins_encode %{
4151     uint vsize = Matcher::vector_length_in_bytes(this);
4152     if (vsize <= 16) {
4153       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4154     } else {
4155       int vlen_enc = vector_length_encoding(this);
4156       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4157     }
4158   %}
4159   ins_pipe( fpu_reg_reg );
4160 %}
4161 
4162 instruct ReplI_M1(vec dst, immI_M1 con) %{
4163   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) >= 16);
4164   match(Set dst (ReplicateB con));
4165   match(Set dst (ReplicateS con));
4166   match(Set dst (ReplicateI con));
4167   effect(TEMP dst);
4168   format %{ "vallones $dst" %}
4169   ins_encode %{
4170     int vector_len = vector_length_encoding(this);
4171     __ vallones($dst$$XMMRegister, vector_len);
4172   %}
4173   ins_pipe( pipe_slow );
4174 %}
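
// Broadcasting the constant -1 is special-cased because every lane is all ones:
// vallones fills the destination directly and avoids the constant-table load that
// ReplI_imm would otherwise emit.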
4175 
4176 // ====================ReplicateL=======================================
4177 
4178 #ifdef _LP64
4179 // Replicate long (8 byte) scalar to be vector
4180 instruct ReplL_reg(vec dst, rRegL src) %{
4181   match(Set dst (ReplicateL src));
4182   format %{ "replicateL $dst,$src" %}
4183   ins_encode %{
4184     uint vlen = Matcher::vector_length(this);
4185     if (vlen == 2) {
4186       __ movdq($dst$$XMMRegister, $src$$Register);
4187       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4188     } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4189       int vlen_enc = vector_length_encoding(this);
4190       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
4191     } else if (VM_Version::supports_avx2()) {
4192       assert(vlen == 4, "sanity");
4193       int vlen_enc = vector_length_encoding(this);
4194       __ movdq($dst$$XMMRegister, $src$$Register);
4195       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4196     } else {
4197       assert(vlen == 4, "sanity");
4198       __ movdq($dst$$XMMRegister, $src$$Register);
4199       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4200       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4201     }
4202   %}
4203   ins_pipe( pipe_slow );
4204 %}
4205 #else // _LP64
4206 // Replicate long (8 byte) scalar to be vector
4207 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
4208   predicate(Matcher::vector_length(n) <= 4);
4209   match(Set dst (ReplicateL src));
4210   effect(TEMP dst, USE src, TEMP tmp);
4211   format %{ "replicateL $dst,$src" %}
4212   ins_encode %{
4213     uint vlen = Matcher::vector_length(this);
4214     if (vlen == 2) {
4215       __ movdl($dst$$XMMRegister, $src$$Register);
4216       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4217       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4218       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4219     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4220       int vlen_enc = Assembler::AVX_256bit;
4221       __ movdl($dst$$XMMRegister, $src$$Register);
4222       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4223       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4224       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4225     } else {
4226       __ movdl($dst$$XMMRegister, $src$$Register);
4227       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4228       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4229       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4230       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4231     }
4232   %}
4233   ins_pipe( pipe_slow );
4234 %}
4235 
4236 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
4237   predicate(Matcher::vector_length(n) == 8);
4238   match(Set dst (ReplicateL src));
4239   effect(TEMP dst, USE src, TEMP tmp);
4240   format %{ "replicateL $dst,$src" %}
4241   ins_encode %{
4242     if (VM_Version::supports_avx512vl()) {
4243       __ movdl($dst$$XMMRegister, $src$$Register);
4244       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4245       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4246       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4247       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4248       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4249     } else {
4250       int vlen_enc = Assembler::AVX_512bit;
4251       __ movdl($dst$$XMMRegister, $src$$Register);
4252       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4253       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4254       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4255     }
4256   %}
4257   ins_pipe( pipe_slow );
4258 %}
4259 #endif // _LP64
4260 
4261 instruct ReplL_mem(vec dst, memory mem) %{
4262   match(Set dst (ReplicateL (LoadL mem)));
4263   format %{ "replicateL $dst,$mem" %}
4264   ins_encode %{
4265     uint vlen = Matcher::vector_length(this);
4266     if (vlen == 2) {
4267       __ movq($dst$$XMMRegister, $mem$$Address);
4268       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4269     } else {
4270       assert(VM_Version::supports_avx2(), "sanity");
4271       int vlen_enc = vector_length_encoding(this);
4272       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
4273     }
4274   %}
4275   ins_pipe( pipe_slow );
4276 %}
4277 
4278 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4279 instruct ReplL_imm(vec dst, immL con) %{
4280   match(Set dst (ReplicateL con));
4281   format %{ "replicateL $dst,$con" %}
4282   ins_encode %{
4283     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, Matcher::vector_length(this)));
4284     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
4285   %}
4286   ins_pipe( pipe_slow );
4287 %}
4288 
4289 instruct ReplL_zero(vec dst, immL0 zero) %{
4290   match(Set dst (ReplicateL zero));
4291   format %{ "replicateL $dst,$zero" %}
4292   ins_encode %{
4293     int vlen = Matcher::vector_length(this);
4294     if (vlen == 2) {
4295       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4296     } else {
4297       int vlen_enc = vector_length_encoding(this);
4298       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4299     }
4300   %}
4301   ins_pipe( fpu_reg_reg );
4302 %}
4303 
4304 instruct ReplL_M1(vec dst, immL_M1 con) %{
4305   predicate(UseAVX > 0);
4306   match(Set dst (ReplicateL con));
4307   effect(TEMP dst);
4308   format %{ "vallones $dst" %}
4309   ins_encode %{
4310     int vector_len = vector_length_encoding(this);
4311     __ vallones($dst$$XMMRegister, vector_len);
4312   %}
4313   ins_pipe( pipe_slow );
4314 %}
4315 
4316 // ====================ReplicateF=======================================
4317 
4318 instruct ReplF_reg(vec dst, vlRegF src) %{
4319   match(Set dst (ReplicateF src));
4320   format %{ "replicateF $dst,$src" %}
4321   ins_encode %{
4322     uint vlen = Matcher::vector_length(this);
4323     if (vlen <= 4) {
4324       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4325     } else if (VM_Version::supports_avx2()) {
4326       int vlen_enc = vector_length_encoding(this);
4327       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4328     } else {
4329       assert(vlen == 8, "sanity");
4330       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4331       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4332     }
4333   %}
4334   ins_pipe( pipe_slow );
4335 %}
4336 
4337 instruct ReplF_mem(vec dst, memory mem) %{
4338   match(Set dst (ReplicateF (LoadF mem)));
4339   format %{ "replicateF $dst,$mem" %}
4340   ins_encode %{
4341     uint vlen = Matcher::vector_length(this);
4342     if (vlen <= 4) {
4343       __ movdl($dst$$XMMRegister, $mem$$Address);
4344       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4345     } else {
4346       assert(VM_Version::supports_avx(), "sanity");
4347       int vlen_enc = vector_length_encoding(this);
4348       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
4349     }
4350   %}
4351   ins_pipe( pipe_slow );
4352 %}
4353 
4354 // Replicate float scalar immediate to be vector by loading from const table.
4355 instruct ReplF_imm(vec dst, immF con) %{
4356   match(Set dst (ReplicateF con));
4357   format %{ "replicateF $dst,$con" %}
4358   ins_encode %{
4359     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant, Matcher::vector_length(this)));
4360     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
4361   %}
4362   ins_pipe( pipe_slow );
4363 %}
4364 
4365 instruct ReplF_zero(vec dst, immF0 zero) %{
4366   match(Set dst (ReplicateF zero));
4367   format %{ "replicateF $dst,$zero" %}
4368   ins_encode %{
4369     uint vlen = Matcher::vector_length(this);
4370     if (vlen <= 4) {
4371       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4372     } else {
4373       int vlen_enc = vector_length_encoding(this);
4374       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4375     }
4376   %}
4377   ins_pipe( fpu_reg_reg );
4378 %}
4379 
4380 // ====================ReplicateD=======================================
4381 
4382 // Replicate double (8 bytes) scalar to be vector
4383 instruct ReplD_reg(vec dst, vlRegD src) %{
4384   match(Set dst (ReplicateD src));
4385   format %{ "replicateD $dst,$src" %}
4386   ins_encode %{
4387     uint vlen = Matcher::vector_length(this);
4388     if (vlen == 2) {
4389       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4390     } else if (VM_Version::supports_avx2()) {
4391       int vlen_enc = vector_length_encoding(this);
4392       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4393     } else {
4394       assert(vlen == 4, "sanity");
4395       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4396       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4397     }
4398   %}
4399   ins_pipe( pipe_slow );
4400 %}
4401 
4402 instruct ReplD_mem(vec dst, memory mem) %{
4403   match(Set dst (ReplicateD (LoadD mem)));
4404   format %{ "replicateD $dst,$mem" %}
4405   ins_encode %{
4406     uint vlen = Matcher::vector_length(this);
4407     if (vlen == 2) {
4408       __ movq($dst$$XMMRegister, $mem$$Address);
4409       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
4410     } else {
4411       assert(VM_Version::supports_avx(), "sanity");
4412       int vlen_enc = vector_length_encoding(this);
4413       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4414     }
4415   %}
4416   ins_pipe( pipe_slow );
4417 %}
4418 
4419 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
4420 instruct ReplD_imm(vec dst, immD con) %{
4421   match(Set dst (ReplicateD con));
4422   format %{ "replicateD $dst,$con" %}
4423   ins_encode %{
4424     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, Matcher::vector_length(this)));
4425     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
4426   %}
4427   ins_pipe( pipe_slow );
4428 %}
4429 
4430 instruct ReplD_zero(vec dst, immD0 zero) %{
4431   match(Set dst (ReplicateD zero));
4432   format %{ "replicateD $dst,$zero" %}
4433   ins_encode %{
4434     uint vlen = Matcher::vector_length(this);
4435     if (vlen == 2) {
4436       __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4437     } else {
4438       int vlen_enc = vector_length_encoding(this);
4439       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4440     }
4441   %}
4442   ins_pipe( fpu_reg_reg );
4443 %}
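
// Illustration (assumes the incubating Vector API; not part of the matcher rules):
// Replicate nodes commonly originate from scalar broadcasts, e.g.
//
//   IntVector ones = IntVector.broadcast(IntVector.SPECIES_256, 1);   // -> ReplicateI
//
// or from the auto-vectorizer when a loop stores a loop-invariant scalar into an array.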
4444 
4445 // ====================VECTOR INSERT=======================================
4446 
4447 instruct insert(vec dst, rRegI val, immU8 idx) %{
4448   predicate(Matcher::vector_length_in_bytes(n) < 32);
4449   match(Set dst (VectorInsert (Binary dst val) idx));
4450   format %{ "vector_insert $dst,$val,$idx" %}
4451   ins_encode %{
4452     assert(UseSSE >= 4, "required");
4453     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
4454 
4455     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4456 
4457     assert(is_integral_type(elem_bt), "");
4458     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4459 
4460     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4461   %}
4462   ins_pipe( pipe_slow );
4463 %}
4464 
4465 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4466   predicate(Matcher::vector_length_in_bytes(n) == 32);
4467   match(Set dst (VectorInsert (Binary src val) idx));
4468   effect(TEMP vtmp);
4469   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4470   ins_encode %{
4471     int vlen_enc = Assembler::AVX_256bit;
4472     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4473     int elem_per_lane = 16/type2aelembytes(elem_bt);
4474     int log2epr = log2(elem_per_lane);
4475 
4476     assert(is_integral_type(elem_bt), "sanity");
4477     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4478 
4479     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4480     uint y_idx = ($idx$$constant >> log2epr) & 1;
4481     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4482     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4483     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4484   %}
4485   ins_pipe( pipe_slow );
4486 %}
4487 
4488 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4489   predicate(Matcher::vector_length_in_bytes(n) == 64);
4490   match(Set dst (VectorInsert (Binary src val) idx));
4491   effect(TEMP vtmp);
4492   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4493   ins_encode %{
4494     assert(UseAVX > 2, "sanity");
4495 
4496     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4497     int elem_per_lane = 16/type2aelembytes(elem_bt);
4498     int log2epr = log2(elem_per_lane);
4499 
4500     assert(is_integral_type(elem_bt), "");
4501     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4502 
4503     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4504     uint y_idx = ($idx$$constant >> log2epr) & 3;
4505     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4506     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4507     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4508   %}
4509   ins_pipe( pipe_slow );
4510 %}
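
// The 256/512-bit insert rules above decompose the element index: x_idx selects the
// element within a 128-bit lane and y_idx selects the lane, so the affected lane is
// extracted into vtmp, patched with a scalar insert, and written back into place.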
4511 
4512 #ifdef _LP64
4513 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4514   predicate(Matcher::vector_length(n) == 2);
4515   match(Set dst (VectorInsert (Binary dst val) idx));
4516   format %{ "vector_insert $dst,$val,$idx" %}
4517   ins_encode %{
4518     assert(UseSSE >= 4, "required");
4519     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4520     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4521 
4522     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4523   %}
4524   ins_pipe( pipe_slow );
4525 %}
4526 
4527 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4528   predicate(Matcher::vector_length(n) == 4);
4529   match(Set dst (VectorInsert (Binary src val) idx));
4530   effect(TEMP vtmp);
4531   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4532   ins_encode %{
4533     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4534     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4535 
4536     uint x_idx = $idx$$constant & right_n_bits(1);
4537     uint y_idx = ($idx$$constant >> 1) & 1;
4538     int vlen_enc = Assembler::AVX_256bit;
4539     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4540     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4541     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4542   %}
4543   ins_pipe( pipe_slow );
4544 %}
4545 
4546 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4547   predicate(Matcher::vector_length(n) == 8);
4548   match(Set dst (VectorInsert (Binary src val) idx));
4549   effect(TEMP vtmp);
4550   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4551   ins_encode %{
4552     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
4553     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4554 
4555     uint x_idx = $idx$$constant & right_n_bits(1);
4556     uint y_idx = ($idx$$constant >> 1) & 3;
4557     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4558     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4559     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4560   %}
4561   ins_pipe( pipe_slow );
4562 %}
4563 #endif
4564 
4565 instruct insertF(vec dst, regF val, immU8 idx) %{
4566   predicate(Matcher::vector_length(n) < 8);
4567   match(Set dst (VectorInsert (Binary dst val) idx));
4568   format %{ "vector_insert $dst,$val,$idx" %}
4569   ins_encode %{
4570     assert(UseSSE >= 4, "sanity");
4571 
4572     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4573     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4574 
4575     uint x_idx = $idx$$constant & right_n_bits(2);
4576     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
4577   %}
4578   ins_pipe( pipe_slow );
4579 %}
4580 
4581 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4582   predicate(Matcher::vector_length(n) >= 8);
4583   match(Set dst (VectorInsert (Binary src val) idx));
4584   effect(TEMP vtmp);
4585   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4586   ins_encode %{
4587     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4588     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4589 
4590     int vlen = Matcher::vector_length(this);
4591     uint x_idx = $idx$$constant & right_n_bits(2);
4592     if (vlen == 8) {
4593       uint y_idx = ($idx$$constant >> 2) & 1;
4594       int vlen_enc = Assembler::AVX_256bit;
4595       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4596       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
4597       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4598     } else {
4599       assert(vlen == 16, "sanity");
4600       uint y_idx = ($idx$$constant >> 2) & 3;
4601       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4602       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
4603       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4604     }
4605   %}
4606   ins_pipe( pipe_slow );
4607 %}
4608 
4609 #ifdef _LP64
4610 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4611   predicate(Matcher::vector_length(n) == 2);
4612   match(Set dst (VectorInsert (Binary dst val) idx));
4613   effect(TEMP tmp);
4614   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4615   ins_encode %{
4616     assert(UseSSE >= 4, "sanity");
4617     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4618     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4619 
4620     __ movq($tmp$$Register, $val$$XMMRegister);
4621     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
4622   %}
4623   ins_pipe( pipe_slow );
4624 %}
4625 
4626 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
4627   predicate(Matcher::vector_length(n) == 4);
4628   match(Set dst (VectorInsert (Binary src val) idx));
4629   effect(TEMP vtmp, TEMP tmp);
4630   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4631   ins_encode %{
4632     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4633     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4634 
4635     uint x_idx = $idx$$constant & right_n_bits(1);
4636     uint y_idx = ($idx$$constant >> 1) & 1;
4637     int vlen_enc = Assembler::AVX_256bit;
4638     __ movq($tmp$$Register, $val$$XMMRegister);
4639     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4640     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4641     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4642   %}
4643   ins_pipe( pipe_slow );
4644 %}
4645 
4646 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
4647   predicate(Matcher::vector_length(n) == 8);
4648   match(Set dst (VectorInsert (Binary src val) idx));
4649   effect(TEMP tmp, TEMP vtmp);
4650   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4651   ins_encode %{
4652     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4653     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4654 
4655     uint x_idx = $idx$$constant & right_n_bits(1);
4656     uint y_idx = ($idx$$constant >> 1) & 3;
4657     __ movq($tmp$$Register, $val$$XMMRegister);
4658     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4659     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4660     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4661   %}
4662   ins_pipe( pipe_slow );
4663 %}
4664 #endif
4665 
4666 // ====================REDUCTION ARITHMETIC=======================================
4667 
4668 // =======================Int Reduction==========================================
4669 
4670 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4671   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
4672   match(Set dst (AddReductionVI src1 src2));
4673   match(Set dst (MulReductionVI src1 src2));
4674   match(Set dst (AndReductionV  src1 src2));
4675   match(Set dst ( OrReductionV  src1 src2));
4676   match(Set dst (XorReductionV  src1 src2));
4677   match(Set dst (MinReductionV  src1 src2));
4678   match(Set dst (MaxReductionV  src1 src2));
4679   effect(TEMP vtmp1, TEMP vtmp2);
4680   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4681   ins_encode %{
4682     int opcode = this->ideal_Opcode();
4683     int vlen = Matcher::vector_length(this, $src2);
4684     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4685   %}
4686   ins_pipe( pipe_slow );
4687 %}
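
// Semantics of the reduction rules: the scalar src1 is folded into the result, i.e. for
// an 8-lane int vector dst = src1 OP src2[0] OP ... OP src2[7]; reduceI picks the
// shuffle/op sequence appropriate for the lane count and operation.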
4688 
4689 // =======================Long Reduction==========================================
4690 
4691 #ifdef _LP64
4692 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4693   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
4694   match(Set dst (AddReductionVL src1 src2));
4695   match(Set dst (MulReductionVL src1 src2));
4696   match(Set dst (AndReductionV  src1 src2));
4697   match(Set dst ( OrReductionV  src1 src2));
4698   match(Set dst (XorReductionV  src1 src2));
4699   match(Set dst (MinReductionV  src1 src2));
4700   match(Set dst (MaxReductionV  src1 src2));
4701   effect(TEMP vtmp1, TEMP vtmp2);
4702   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4703   ins_encode %{
4704     int opcode = this->ideal_Opcode();
4705     int vlen = Matcher::vector_length(this, $src2);
4706     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4707   %}
4708   ins_pipe( pipe_slow );
4709 %}
4710 
4711 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
4712   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
4713   match(Set dst (AddReductionVL src1 src2));
4714   match(Set dst (MulReductionVL src1 src2));
4715   match(Set dst (AndReductionV  src1 src2));
4716   match(Set dst ( OrReductionV  src1 src2));
4717   match(Set dst (XorReductionV  src1 src2));
4718   match(Set dst (MinReductionV  src1 src2));
4719   match(Set dst (MaxReductionV  src1 src2));
4720   effect(TEMP vtmp1, TEMP vtmp2);
4721   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4722   ins_encode %{
4723     int opcode = this->ideal_Opcode();
4724     int vlen = Matcher::vector_length(this, $src2);
4725     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4726   %}
4727   ins_pipe( pipe_slow );
4728 %}
4729 #endif // _LP64
4730 
4731 // =======================Float Reduction==========================================
4732 
4733 instruct reductionF128(regF dst, vec src, vec vtmp) %{
4734   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
4735   match(Set dst (AddReductionVF dst src));
4736   match(Set dst (MulReductionVF dst src));
4737   effect(TEMP dst, TEMP vtmp);
4738   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
4739   ins_encode %{
4740     int opcode = this->ideal_Opcode();
4741     int vlen = Matcher::vector_length(this, $src);
4742     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4743   %}
4744   ins_pipe( pipe_slow );
4745 %}
4746 
4747 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
4748   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4749   match(Set dst (AddReductionVF dst src));
4750   match(Set dst (MulReductionVF dst src));
4751   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4752   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4753   ins_encode %{
4754     int opcode = this->ideal_Opcode();
4755     int vlen = Matcher::vector_length(this, $src);
4756     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4757   %}
4758   ins_pipe( pipe_slow );
4759 %}
4760 
4761 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4762   predicate(Matcher::vector_length(n->in(2)) == 16); // src
4763   match(Set dst (AddReductionVF dst src));
4764   match(Set dst (MulReductionVF dst src));
4765   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4766   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4767   ins_encode %{
4768     int opcode = this->ideal_Opcode();
4769     int vlen = Matcher::vector_length(this, $src);
4770     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4771   %}
4772   ins_pipe( pipe_slow );
4773 %}
4774 
4775 // =======================Double Reduction==========================================
4776 
4777 instruct reduction2D(regD dst, vec src, vec vtmp) %{
4778   predicate(Matcher::vector_length(n->in(2)) == 2); // src
4779   match(Set dst (AddReductionVD dst src));
4780   match(Set dst (MulReductionVD dst src));
4781   effect(TEMP dst, TEMP vtmp);
4782   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
4783   ins_encode %{
4784     int opcode = this->ideal_Opcode();
4785     int vlen = Matcher::vector_length(this, $src);
4786     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4787   %}
4788   ins_pipe( pipe_slow );
4789 %}
4790 
4791 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
4792   predicate(Matcher::vector_length(n->in(2)) == 4); // src
4793   match(Set dst (AddReductionVD dst src));
4794   match(Set dst (MulReductionVD dst src));
4795   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4796   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4797   ins_encode %{
4798     int opcode = this->ideal_Opcode();
4799     int vlen = Matcher::vector_length(this, $src);
4800     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4801   %}
4802   ins_pipe( pipe_slow );
4803 %}
4804 
4805 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4806   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4807   match(Set dst (AddReductionVD dst src));
4808   match(Set dst (MulReductionVD dst src));
4809   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4810   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4811   ins_encode %{
4812     int opcode = this->ideal_Opcode();
4813     int vlen = Matcher::vector_length(this, $src);
4814     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4815   %}
4816   ins_pipe( pipe_slow );
4817 %}
4818 
4819 // =======================Byte Reduction==========================================
4820 
4821 #ifdef _LP64
4822 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4823   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
4824   match(Set dst (AddReductionVI src1 src2));
4825   match(Set dst (AndReductionV  src1 src2));
4826   match(Set dst ( OrReductionV  src1 src2));
4827   match(Set dst (XorReductionV  src1 src2));
4828   match(Set dst (MinReductionV  src1 src2));
4829   match(Set dst (MaxReductionV  src1 src2));
4830   effect(TEMP vtmp1, TEMP vtmp2);
4831   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4832   ins_encode %{
4833     int opcode = this->ideal_Opcode();
4834     int vlen = Matcher::vector_length(this, $src2);
4835     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4836   %}
4837   ins_pipe( pipe_slow );
4838 %}
4839 
4840 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4841   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
4842   match(Set dst (AddReductionVI src1 src2));
4843   match(Set dst (AndReductionV  src1 src2));
4844   match(Set dst ( OrReductionV  src1 src2));
4845   match(Set dst (XorReductionV  src1 src2));
4846   match(Set dst (MinReductionV  src1 src2));
4847   match(Set dst (MaxReductionV  src1 src2));
4848   effect(TEMP vtmp1, TEMP vtmp2);
4849   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4850   ins_encode %{
4851     int opcode = this->ideal_Opcode();
4852     int vlen = Matcher::vector_length(this, $src2);
4853     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4854   %}
4855   ins_pipe( pipe_slow );
4856 %}
4857 #endif
4858 
4859 // =======================Short Reduction==========================================
4860 
4861 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4862   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
4863   match(Set dst (AddReductionVI src1 src2));
4864   match(Set dst (MulReductionVI src1 src2));
4865   match(Set dst (AndReductionV  src1 src2));
4866   match(Set dst ( OrReductionV  src1 src2));
4867   match(Set dst (XorReductionV  src1 src2));
4868   match(Set dst (MinReductionV  src1 src2));
4869   match(Set dst (MaxReductionV  src1 src2));
4870   effect(TEMP vtmp1, TEMP vtmp2);
4871   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4872   ins_encode %{
4873     int opcode = this->ideal_Opcode();
4874     int vlen = Matcher::vector_length(this, $src2);
4875     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4876   %}
4877   ins_pipe( pipe_slow );
4878 %}
4879 
4880 // =======================Mul Reduction==========================================
4881 
4882 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4883   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4884             Matcher::vector_length(n->in(2)) <= 32); // src2
4885   match(Set dst (MulReductionVI src1 src2));
4886   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4887   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4888   ins_encode %{
4889     int opcode = this->ideal_Opcode();
4890     int vlen = Matcher::vector_length(this, $src2);
4891     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4892   %}
4893   ins_pipe( pipe_slow );
4894 %}
4895 
4896 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4897   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4898             Matcher::vector_length(n->in(2)) == 64); // src2
4899   match(Set dst (MulReductionVI src1 src2));
4900   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4901   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4902   ins_encode %{
4903     int opcode = this->ideal_Opcode();
4904     int vlen = Matcher::vector_length(this, $src2);
4905     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4906   %}
4907   ins_pipe( pipe_slow );
4908 %}
4909 
4910 //--------------------Min/Max Float Reduction --------------------
4911 // Float Min/Max Reduction
4912 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
4913                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4914   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4915             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4916              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4917             Matcher::vector_length(n->in(2)) == 2);
4918   match(Set dst (MinReductionV src1 src2));
4919   match(Set dst (MaxReductionV src1 src2));
4920   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4921   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4922   ins_encode %{
4923     assert(UseAVX > 0, "sanity");
4924 
4925     int opcode = this->ideal_Opcode();
4926     int vlen = Matcher::vector_length(this, $src2);
4927     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4928                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4929   %}
4930   ins_pipe( pipe_slow );
4931 %}
4932 
4933 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
4934                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4935   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4936             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4937              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4938             Matcher::vector_length(n->in(2)) >= 4);
4939   match(Set dst (MinReductionV src1 src2));
4940   match(Set dst (MaxReductionV src1 src2));
4941   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4942   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4943   ins_encode %{
4944     assert(UseAVX > 0, "sanity");
4945 
4946     int opcode = this->ideal_Opcode();
4947     int vlen = Matcher::vector_length(this, $src2);
4948     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4949                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4950   %}
4951   ins_pipe( pipe_slow );
4952 %}
4953 
4954 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
4955                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4956   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4957             Matcher::vector_length(n->in(2)) == 2);
4958   match(Set dst (MinReductionV dst src));
4959   match(Set dst (MaxReductionV dst src));
4960   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4961   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4962   ins_encode %{
4963     assert(UseAVX > 0, "sanity");
4964 
4965     int opcode = this->ideal_Opcode();
4966     int vlen = Matcher::vector_length(this, $src);
4967     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4968                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4969   %}
4970   ins_pipe( pipe_slow );
4971 %}
4972 
4973 
4974 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
4975                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4976   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4977             Matcher::vector_length(n->in(2)) >= 4);
4978   match(Set dst (MinReductionV dst src));
4979   match(Set dst (MaxReductionV dst src));
4980   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4981   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4982   ins_encode %{
4983     assert(UseAVX > 0, "sanity");
4984 
4985     int opcode = this->ideal_Opcode();
4986     int vlen = Matcher::vector_length(this, $src);
4987     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4988                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4989   %}
4990   ins_pipe( pipe_slow );
4991 %}
4992 
4993 
4994 //--------------------Min/Max Double Reduction --------------------
4995 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
4996                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4997                             rFlagsReg cr) %{
4998   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4999             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
5000              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
5001             Matcher::vector_length(n->in(2)) == 2);
5002   match(Set dst (MinReductionV src1 src2));
5003   match(Set dst (MaxReductionV src1 src2));
5004   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
5005   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
5006   ins_encode %{
5007     assert(UseAVX > 0, "sanity");
5008 
5009     int opcode = this->ideal_Opcode();
5010     int vlen = Matcher::vector_length(this, $src2);
5011     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
5012                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
5013   %}
5014   ins_pipe( pipe_slow );
5015 %}
5016 
5017 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
5018                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
5019                            rFlagsReg cr) %{
5020   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5021             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
5022              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
5023             Matcher::vector_length(n->in(2)) >= 4);
5024   match(Set dst (MinReductionV src1 src2));
5025   match(Set dst (MaxReductionV src1 src2));
5026   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
5027   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
5028   ins_encode %{
5029     assert(UseAVX > 0, "sanity");
5030 
5031     int opcode = this->ideal_Opcode();
5032     int vlen = Matcher::vector_length(this, $src2);
5033     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
5034                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
5035   %}
5036   ins_pipe( pipe_slow );
5037 %}
5038 
5039 
5040 instruct minmax_reduction2D_av(legRegD dst, legVec src,
5041                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
5042                                rFlagsReg cr) %{
5043   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5044             Matcher::vector_length(n->in(2)) == 2);
5045   match(Set dst (MinReductionV dst src));
5046   match(Set dst (MaxReductionV dst src));
5047   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
5048   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
5049   ins_encode %{
5050     assert(UseAVX > 0, "sanity");
5051 
5052     int opcode = this->ideal_Opcode();
5053     int vlen = Matcher::vector_length(this, $src);
5054     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
5055                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
5056   %}
5057   ins_pipe( pipe_slow );
5058 %}
5059 
5060 instruct minmax_reductionD_av(legRegD dst, legVec src,
5061                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
5062                               rFlagsReg cr) %{
5063   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5064             Matcher::vector_length(n->in(2)) >= 4);
5065   match(Set dst (MinReductionV dst src));
5066   match(Set dst (MaxReductionV dst src));
5067   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
5068   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
5069   ins_encode %{
5070     assert(UseAVX > 0, "sanity");
5071 
5072     int opcode = this->ideal_Opcode();
5073     int vlen = Matcher::vector_length(this, $src);
5074     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
5075                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
5076   %}
5077   ins_pipe( pipe_slow );
5078 %}
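
// The two min/max rule families above differ only in the accumulator. The imm-seeded
// forms fire when src1 is the identity element (+Inf for min, -Inf for max), which is
// why the predicate checks POS_INF/NEG_INF: src1 is never passed to the reduction macro,
// so it must be a value that cannot affect the result. The accumulating (_av) forms
// reduce into dst instead and pass 'true' to reduceFloatMinMax/reduceDoubleMinMax so
// that the previous value of dst also participates in the result.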
5079 
5080 // ====================VECTOR ARITHMETIC=======================================
5081 
5082 // --------------------------------- ADD --------------------------------------
5083 
5084 // Bytes vector add
5085 instruct vaddB(vec dst, vec src) %{
5086   predicate(UseAVX == 0);
5087   match(Set dst (AddVB dst src));
5088   format %{ "paddb   $dst,$src\t! add packedB" %}
5089   ins_encode %{
5090     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5091   %}
5092   ins_pipe( pipe_slow );
5093 %}
5094 
5095 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
5096   predicate(UseAVX > 0);
5097   match(Set dst (AddVB src1 src2));
5098   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
5099   ins_encode %{
5100     int vlen_enc = vector_length_encoding(this);
5101     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5102   %}
5103   ins_pipe( pipe_slow );
5104 %}
5105 
5106 instruct vaddB_mem(vec dst, vec src, memory mem) %{
5107   predicate((UseAVX > 0) &&
5108             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5109   match(Set dst (AddVB src (LoadVector mem)));
5110   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
5111   ins_encode %{
5112     int vlen_enc = vector_length_encoding(this);
5113     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5114   %}
5115   ins_pipe( pipe_slow );
5116 %}
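
// Each element type in this ADD section follows the same three-rule pattern: a
// two-operand SSE form for UseAVX == 0, a three-operand AVX register form, and a
// register-memory form that folds the load. Illustration (not part of the matcher
// rules): a simple loop such as
//
//   for (int i = 0; i < n; i++) { c[i] = (byte) (a[i] + b[i]); }
//
// is typically auto-vectorized by C2 into AddVB nodes that these rules match.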
5117 
5118 // Shorts/Chars vector add
5119 instruct vaddS(vec dst, vec src) %{
5120   predicate(UseAVX == 0);
5121   match(Set dst (AddVS dst src));
5122   format %{ "paddw   $dst,$src\t! add packedS" %}
5123   ins_encode %{
5124     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5125   %}
5126   ins_pipe( pipe_slow );
5127 %}
5128 
5129 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
5130   predicate(UseAVX > 0);
5131   match(Set dst (AddVS src1 src2));
5132   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
5133   ins_encode %{
5134     int vlen_enc = vector_length_encoding(this);
5135     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5136   %}
5137   ins_pipe( pipe_slow );
5138 %}
5139 
5140 instruct vaddS_mem(vec dst, vec src, memory mem) %{
5141   predicate((UseAVX > 0) &&
5142             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5143   match(Set dst (AddVS src (LoadVector mem)));
5144   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
5145   ins_encode %{
5146     int vlen_enc = vector_length_encoding(this);
5147     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5148   %}
5149   ins_pipe( pipe_slow );
5150 %}
5151 
5152 // Integers vector add
5153 instruct vaddI(vec dst, vec src) %{
5154   predicate(UseAVX == 0);
5155   match(Set dst (AddVI dst src));
5156   format %{ "paddd   $dst,$src\t! add packedI" %}
5157   ins_encode %{
5158     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
5159   %}
5160   ins_pipe( pipe_slow );
5161 %}
5162 
5163 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
5164   predicate(UseAVX > 0);
5165   match(Set dst (AddVI src1 src2));
5166   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
5167   ins_encode %{
5168     int vlen_enc = vector_length_encoding(this);
5169     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5170   %}
5171   ins_pipe( pipe_slow );
5172 %}
5173 
5174 
5175 instruct vaddI_mem(vec dst, vec src, memory mem) %{
5176   predicate((UseAVX > 0) &&
5177             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5178   match(Set dst (AddVI src (LoadVector mem)));
5179   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
5180   ins_encode %{
5181     int vlen_enc = vector_length_encoding(this);
5182     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5183   %}
5184   ins_pipe( pipe_slow );
5185 %}
5186 
5187 // Longs vector add
5188 instruct vaddL(vec dst, vec src) %{
5189   predicate(UseAVX == 0);
5190   match(Set dst (AddVL dst src));
5191   format %{ "paddq   $dst,$src\t! add packedL" %}
5192   ins_encode %{
5193     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
5194   %}
5195   ins_pipe( pipe_slow );
5196 %}
5197 
5198 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
5199   predicate(UseAVX > 0);
5200   match(Set dst (AddVL src1 src2));
5201   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
5202   ins_encode %{
5203     int vlen_enc = vector_length_encoding(this);
5204     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5205   %}
5206   ins_pipe( pipe_slow );
5207 %}
5208 
5209 instruct vaddL_mem(vec dst, vec src, memory mem) %{
5210   predicate((UseAVX > 0) &&
5211             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5212   match(Set dst (AddVL src (LoadVector mem)));
5213   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
5214   ins_encode %{
5215     int vlen_enc = vector_length_encoding(this);
5216     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5217   %}
5218   ins_pipe( pipe_slow );
5219 %}
5220 
5221 // Floats vector add
5222 instruct vaddF(vec dst, vec src) %{
5223   predicate(UseAVX == 0);
5224   match(Set dst (AddVF dst src));
5225   format %{ "addps   $dst,$src\t! add packedF" %}
5226   ins_encode %{
5227     __ addps($dst$$XMMRegister, $src$$XMMRegister);
5228   %}
5229   ins_pipe( pipe_slow );
5230 %}
5231 
5232 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
5233   predicate(UseAVX > 0);
5234   match(Set dst (AddVF src1 src2));
5235   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
5236   ins_encode %{
5237     int vlen_enc = vector_length_encoding(this);
5238     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5239   %}
5240   ins_pipe( pipe_slow );
5241 %}
5242 
5243 instruct vaddF_mem(vec dst, vec src, memory mem) %{
5244   predicate((UseAVX > 0) &&
5245             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5246   match(Set dst (AddVF src (LoadVector mem)));
5247   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
5248   ins_encode %{
5249     int vlen_enc = vector_length_encoding(this);
5250     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5251   %}
5252   ins_pipe( pipe_slow );
5253 %}
5254 
5255 // Doubles vector add
5256 instruct vaddD(vec dst, vec src) %{
5257   predicate(UseAVX == 0);
5258   match(Set dst (AddVD dst src));
5259   format %{ "addpd   $dst,$src\t! add packedD" %}
5260   ins_encode %{
5261     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
5262   %}
5263   ins_pipe( pipe_slow );
5264 %}
5265 
5266 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
5267   predicate(UseAVX > 0);
5268   match(Set dst (AddVD src1 src2));
5269   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
5270   ins_encode %{
5271     int vlen_enc = vector_length_encoding(this);
5272     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5273   %}
5274   ins_pipe( pipe_slow );
5275 %}
5276 
5277 instruct vaddD_mem(vec dst, vec src, memory mem) %{
5278   predicate((UseAVX > 0) &&
5279             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5280   match(Set dst (AddVD src (LoadVector mem)));
5281   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
5282   ins_encode %{
5283     int vlen_enc = vector_length_encoding(this);
5284     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5285   %}
5286   ins_pipe( pipe_slow );
5287 %}
5288 
5289 // --------------------------------- SUB --------------------------------------
5290 
5291 // Bytes vector sub
5292 instruct vsubB(vec dst, vec src) %{
5293   predicate(UseAVX == 0);
5294   match(Set dst (SubVB dst src));
5295   format %{ "psubb   $dst,$src\t! sub packedB" %}
5296   ins_encode %{
5297     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5298   %}
5299   ins_pipe( pipe_slow );
5300 %}
5301 
5302 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
5303   predicate(UseAVX > 0);
5304   match(Set dst (SubVB src1 src2));
5305   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
5306   ins_encode %{
5307     int vlen_enc = vector_length_encoding(this);
5308     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5309   %}
5310   ins_pipe( pipe_slow );
5311 %}
5312 
5313 instruct vsubB_mem(vec dst, vec src, memory mem) %{
5314   predicate((UseAVX > 0) &&
5315             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5316   match(Set dst (SubVB src (LoadVector mem)));
5317   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
5318   ins_encode %{
5319     int vlen_enc = vector_length_encoding(this);
5320     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5321   %}
5322   ins_pipe( pipe_slow );
5323 %}
5324 
5325 // Shorts/Chars vector sub
5326 instruct vsubS(vec dst, vec src) %{
5327   predicate(UseAVX == 0);
5328   match(Set dst (SubVS dst src));
5329   format %{ "psubw   $dst,$src\t! sub packedS" %}
5330   ins_encode %{
5331     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5332   %}
5333   ins_pipe( pipe_slow );
5334 %}
5335 
5336 
5337 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
5338   predicate(UseAVX > 0);
5339   match(Set dst (SubVS src1 src2));
5340   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
5341   ins_encode %{
5342     int vlen_enc = vector_length_encoding(this);
5343     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5344   %}
5345   ins_pipe( pipe_slow );
5346 %}
5347 
5348 instruct vsubS_mem(vec dst, vec src, memory mem) %{
5349   predicate((UseAVX > 0) &&
5350             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5351   match(Set dst (SubVS src (LoadVector mem)));
5352   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
5353   ins_encode %{
5354     int vlen_enc = vector_length_encoding(this);
5355     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5356   %}
5357   ins_pipe( pipe_slow );
5358 %}
5359 
5360 // Integers vector sub
5361 instruct vsubI(vec dst, vec src) %{
5362   predicate(UseAVX == 0);
5363   match(Set dst (SubVI dst src));
5364   format %{ "psubd   $dst,$src\t! sub packedI" %}
5365   ins_encode %{
5366     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
5367   %}
5368   ins_pipe( pipe_slow );
5369 %}
5370 
5371 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
5372   predicate(UseAVX > 0);
5373   match(Set dst (SubVI src1 src2));
5374   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
5375   ins_encode %{
5376     int vlen_enc = vector_length_encoding(this);
5377     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5378   %}
5379   ins_pipe( pipe_slow );
5380 %}
5381 
5382 instruct vsubI_mem(vec dst, vec src, memory mem) %{
5383   predicate((UseAVX > 0) &&
5384             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5385   match(Set dst (SubVI src (LoadVector mem)));
5386   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
5387   ins_encode %{
5388     int vlen_enc = vector_length_encoding(this);
5389     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5390   %}
5391   ins_pipe( pipe_slow );
5392 %}
5393 
5394 // Longs vector sub
5395 instruct vsubL(vec dst, vec src) %{
5396   predicate(UseAVX == 0);
5397   match(Set dst (SubVL dst src));
5398   format %{ "psubq   $dst,$src\t! sub packedL" %}
5399   ins_encode %{
5400     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
5401   %}
5402   ins_pipe( pipe_slow );
5403 %}
5404 
5405 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
5406   predicate(UseAVX > 0);
5407   match(Set dst (SubVL src1 src2));
5408   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
5409   ins_encode %{
5410     int vlen_enc = vector_length_encoding(this);
5411     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5412   %}
5413   ins_pipe( pipe_slow );
5414 %}
5415 
5416 
5417 instruct vsubL_mem(vec dst, vec src, memory mem) %{
5418   predicate((UseAVX > 0) &&
5419             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5420   match(Set dst (SubVL src (LoadVector mem)));
5421   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
5422   ins_encode %{
5423     int vlen_enc = vector_length_encoding(this);
5424     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5425   %}
5426   ins_pipe( pipe_slow );
5427 %}
5428 
5429 // Floats vector sub
5430 instruct vsubF(vec dst, vec src) %{
5431   predicate(UseAVX == 0);
5432   match(Set dst (SubVF dst src));
5433   format %{ "subps   $dst,$src\t! sub packedF" %}
5434   ins_encode %{
5435     __ subps($dst$$XMMRegister, $src$$XMMRegister);
5436   %}
5437   ins_pipe( pipe_slow );
5438 %}
5439 
5440 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
5441   predicate(UseAVX > 0);
5442   match(Set dst (SubVF src1 src2));
5443   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
5444   ins_encode %{
5445     int vlen_enc = vector_length_encoding(this);
5446     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5447   %}
5448   ins_pipe( pipe_slow );
5449 %}
5450 
5451 instruct vsubF_mem(vec dst, vec src, memory mem) %{
5452   predicate((UseAVX > 0) &&
5453             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5454   match(Set dst (SubVF src (LoadVector mem)));
5455   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
5456   ins_encode %{
5457     int vlen_enc = vector_length_encoding(this);
5458     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5459   %}
5460   ins_pipe( pipe_slow );
5461 %}
5462 
5463 // Doubles vector sub
5464 instruct vsubD(vec dst, vec src) %{
5465   predicate(UseAVX == 0);
5466   match(Set dst (SubVD dst src));
5467   format %{ "subpd   $dst,$src\t! sub packedD" %}
5468   ins_encode %{
5469     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
5470   %}
5471   ins_pipe( pipe_slow );
5472 %}
5473 
5474 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
5475   predicate(UseAVX > 0);
5476   match(Set dst (SubVD src1 src2));
5477   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
5478   ins_encode %{
5479     int vlen_enc = vector_length_encoding(this);
5480     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5481   %}
5482   ins_pipe( pipe_slow );
5483 %}
5484 
5485 instruct vsubD_mem(vec dst, vec src, memory mem) %{
5486   predicate((UseAVX > 0) &&
5487             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5488   match(Set dst (SubVD src (LoadVector mem)));
5489   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
5490   ins_encode %{
5491     int vlen_enc = vector_length_encoding(this);
5492     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5493   %}
5494   ins_pipe( pipe_slow );
5495 %}
5496 
5497 // --------------------------------- MUL --------------------------------------
5498 
5499 // Byte vector mul
5500 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5501   predicate(Matcher::vector_length(n) == 4 ||
5502             Matcher::vector_length(n) == 8);
5503   match(Set dst (MulVB src1 src2));
5504   effect(TEMP dst, TEMP tmp, TEMP scratch);
5505   format %{"vector_mulB $dst,$src1,$src2" %}
5506   ins_encode %{
5507     assert(UseSSE > 3, "required");
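    // No byte multiply instruction: sign-extend both operands to words,
    // multiply as words, keep the low byte of each word result and pack
    // back down to bytes.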
5508     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
5509     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
5510     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
5511     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5512     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5513     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5514   %}
5515   ins_pipe( pipe_slow );
5516 %}
5517 
5518 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5519   predicate(Matcher::vector_length(n) == 16 && UseAVX <= 1);
5520   match(Set dst (MulVB src1 src2));
5521   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5522   format %{"vector_mulB $dst,$src1,$src2" %}
5523   ins_encode %{
5524     assert(UseSSE > 3, "required");
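    // Same widen-multiply-mask-pack scheme as above, applied separately
    // to the low and high 8-byte halves and merged by the final packuswb.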
5525     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
5526     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
5527     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
5528     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
5529     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
5530     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5531     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
5532     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
5533     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5534     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5535     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5536     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5537   %}
5538   ins_pipe( pipe_slow );
5539 %}
5540 
5541 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5542   predicate(Matcher::vector_length(n) == 16 && UseAVX > 1);
5543   match(Set dst (MulVB src1 src2));
5544   effect(TEMP dst, TEMP tmp, TEMP scratch);
5545   format %{"vector_mulB $dst,$src1,$src2" %}
5546   ins_encode %{
5547     int vlen_enc = Assembler::AVX_256bit;
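    // With AVX2 all 16 bytes fit in one 256-bit word vector: widen both
    // operands, multiply, mask the low bytes, then fold the upper 128-bit
    // lane back in with vextracti128_high/vpackuswb.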
5548     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5549     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5550     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5551     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5552     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5553     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
5554     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
5555   %}
5556   ins_pipe( pipe_slow );
5557 %}
5558 
5559 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5560   predicate(Matcher::vector_length(n) == 32);
5561   match(Set dst (MulVB src1 src2));
5562   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5563   format %{"vector_mulB $dst,$src1,$src2" %}
5564   ins_encode %{
5565     assert(UseAVX > 1, "required");
5566     int vlen_enc = Assembler::AVX_256bit;
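    // Widen each 16-byte half to a 256-bit word vector, multiply and mask,
    // then repack; the in-lane vpackuswb interleaves the 128-bit lanes,
    // which the final vpermq with selector 0xD8 puts back in order.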
5567     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5568     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
5569     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5570     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5571     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5572     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5573     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5574     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5575     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5576     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5577     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5578     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5579     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5580     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5581   %}
5582   ins_pipe( pipe_slow );
5583 %}
5584 
5585 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5586   predicate(Matcher::vector_length(n) == 64);
5587   match(Set dst (MulVB src1 src2));
5588   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5589   format %{"vector_mulB $dst,$src1,$src2" %}
5590   ins_encode %{
5591     assert(UseAVX > 2, "required");
5592     int vlen_enc = Assembler::AVX_512bit;
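    // Same halving scheme at 512 bits; here the cross-lane fix-up after
    // vpackuswb is a vpermq driven by the vector_byte_perm_mask table
    // rather than an immediate selector.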
5593     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5594     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
5595     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5596     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5597     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5598     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5599     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5600     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5601     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5602     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5603     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5604     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5605     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5606     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
5607     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5608   %}
5609   ins_pipe( pipe_slow );
5610 %}
5611 
5612 // Shorts/Chars vector mul
5613 instruct vmulS(vec dst, vec src) %{
5614   predicate(UseAVX == 0);
5615   match(Set dst (MulVS dst src));
5616   format %{ "pmullw $dst,$src\t! mul packedS" %}
5617   ins_encode %{
5618     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
5619   %}
5620   ins_pipe( pipe_slow );
5621 %}
5622 
5623 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
5624   predicate(UseAVX > 0);
5625   match(Set dst (MulVS src1 src2));
5626   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
5627   ins_encode %{
5628     int vlen_enc = vector_length_encoding(this);
5629     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5630   %}
5631   ins_pipe( pipe_slow );
5632 %}
5633 
5634 instruct vmulS_mem(vec dst, vec src, memory mem) %{
5635   predicate((UseAVX > 0) &&
5636             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5637   match(Set dst (MulVS src (LoadVector mem)));
5638   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
5639   ins_encode %{
5640     int vlen_enc = vector_length_encoding(this);
5641     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5642   %}
5643   ins_pipe( pipe_slow );
5644 %}
5645 
5646 // Integers vector mul
5647 instruct vmulI(vec dst, vec src) %{
5648   predicate(UseAVX == 0);
5649   match(Set dst (MulVI dst src));
5650   format %{ "pmulld  $dst,$src\t! mul packedI" %}
5651   ins_encode %{
5652     assert(UseSSE > 3, "required");
5653     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
5654   %}
5655   ins_pipe( pipe_slow );
5656 %}
5657 
5658 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
5659   predicate(UseAVX > 0);
5660   match(Set dst (MulVI src1 src2));
5661   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
5662   ins_encode %{
5663     int vlen_enc = vector_length_encoding(this);
5664     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5665   %}
5666   ins_pipe( pipe_slow );
5667 %}
5668 
5669 instruct vmulI_mem(vec dst, vec src, memory mem) %{
5670   predicate((UseAVX > 0) &&
5671             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5672   match(Set dst (MulVI src (LoadVector mem)));
5673   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
5674   ins_encode %{
5675     int vlen_enc = vector_length_encoding(this);
5676     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5677   %}
5678   ins_pipe( pipe_slow );
5679 %}
5680 
5681 // Longs vector mul
5682 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
5683   predicate(VM_Version::supports_avx512dq());
5684   match(Set dst (MulVL src1 src2));
5685   format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
5686   ins_encode %{
5687     assert(UseAVX > 2, "required");
5688     int vlen_enc = vector_length_encoding(this);
5689     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5690   %}
5691   ins_pipe( pipe_slow );
5692 %}
5693 
5694 instruct vmulL_mem(vec dst, vec src, memory mem) %{
5695   predicate(VM_Version::supports_avx512dq() &&
5696               (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5697   match(Set dst (MulVL src (LoadVector mem)));
5698   format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
5699   ins_encode %{
5700     assert(UseAVX > 2, "required");
5701     int vlen_enc = vector_length_encoding(this);
5702     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5703   %}
5704   ins_pipe( pipe_slow );
5705 %}
5706 
5707 instruct mul2L_reg(vec dst, vec src2, legVec tmp) %{
5708   predicate(Matcher::vector_length(n) == 2 && !VM_Version::supports_avx512dq());
5709   match(Set dst (MulVL dst src2));
5710   effect(TEMP dst, TEMP tmp);
5711   format %{ "pshufd $tmp,$src2, 177\n\t"
5712             "pmulld $tmp,$dst\n\t"
5713             "phaddd $tmp,$tmp\n\t"
5714             "pmovzxdq $tmp,$tmp\n\t"
5715             "psllq $tmp, 32\n\t"
5716             "pmuludq $dst,$src2\n\t"
5717             "paddq $dst,$tmp\n\t! mul packed2L" %}
5718 
5719   ins_encode %{
5720     assert(VM_Version::supports_sse4_1(), "required");
5721     int vlen_enc = Assembler::AVX_128bit;
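    // 64x64->64 bit multiply from 32-bit pieces:
    //   lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32)
    // pshufd/pmulld/phaddd form the cross terms, pmuludq the unsigned
    // low-half products.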
5722     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
5723     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
5724     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5725     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
5726     __ psllq($tmp$$XMMRegister, 32);
5727     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
5728     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
5729   %}
5730   ins_pipe( pipe_slow );
5731 %}
5732 
5733 instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, legVec tmp, legVec tmp1) %{
5734   predicate(Matcher::vector_length(n) == 4 && !VM_Version::supports_avx512dq());
5735   match(Set dst (MulVL src1 src2));
5736   effect(TEMP tmp1, TEMP tmp);
5737   format %{ "vpshufd $tmp,$src2\n\t"
5738             "vpmulld $tmp,$src1,$tmp\n\t"
5739             "vphaddd $tmp,$tmp,$tmp\n\t"
5740             "vpmovzxdq $tmp,$tmp\n\t"
5741             "vpsllq $tmp,$tmp\n\t"
5742             "vpmuludq $tmp1,$src1,$src2\n\t"
5743             "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
5744   ins_encode %{
5745     int vlen_enc = Assembler::AVX_256bit;
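    // Same 32-bit decomposition as mul2L_reg above: the dword cross
    // products are summed with vphaddd, widened to qwords, shifted up by
    // 32 and added to the unsigned low-half products from vpmuludq.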
5746     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
5747     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5748     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
5749     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5750     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5751     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
5752     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5753     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5754   %}
5755   ins_pipe( pipe_slow );
5756 %}
5757 
5758 // Floats vector mul
5759 instruct vmulF(vec dst, vec src) %{
5760   predicate(UseAVX == 0);
5761   match(Set dst (MulVF dst src));
5762   format %{ "mulps   $dst,$src\t! mul packedF" %}
5763   ins_encode %{
5764     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
5765   %}
5766   ins_pipe( pipe_slow );
5767 %}
5768 
5769 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
5770   predicate(UseAVX > 0);
5771   match(Set dst (MulVF src1 src2));
5772   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
5773   ins_encode %{
5774     int vlen_enc = vector_length_encoding(this);
5775     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5776   %}
5777   ins_pipe( pipe_slow );
5778 %}
5779 
5780 instruct vmulF_mem(vec dst, vec src, memory mem) %{
5781   predicate((UseAVX > 0) &&
5782             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5783   match(Set dst (MulVF src (LoadVector mem)));
5784   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
5785   ins_encode %{
5786     int vlen_enc = vector_length_encoding(this);
5787     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5788   %}
5789   ins_pipe( pipe_slow );
5790 %}
5791 
5792 // Doubles vector mul
5793 instruct vmulD(vec dst, vec src) %{
5794   predicate(UseAVX == 0);
5795   match(Set dst (MulVD dst src));
5796   format %{ "mulpd   $dst,$src\t! mul packedD" %}
5797   ins_encode %{
5798     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
5799   %}
5800   ins_pipe( pipe_slow );
5801 %}
5802 
5803 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
5804   predicate(UseAVX > 0);
5805   match(Set dst (MulVD src1 src2));
5806   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
5807   ins_encode %{
5808     int vlen_enc = vector_length_encoding(this);
5809     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5810   %}
5811   ins_pipe( pipe_slow );
5812 %}
5813 
5814 instruct vmulD_mem(vec dst, vec src, memory mem) %{
5815   predicate((UseAVX > 0) &&
5816             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5817   match(Set dst (MulVD src (LoadVector mem)));
5818   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
5819   ins_encode %{
5820     int vlen_enc = vector_length_encoding(this);
5821     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5822   %}
5823   ins_pipe( pipe_slow );
5824 %}
5825 
5826 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5827   predicate(Matcher::vector_length(n) == 8);
5828   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
5829   effect(TEMP dst, USE src1, USE src2);
5830   format %{ "vcmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
5831             "vblendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
5832          %}
5833   ins_encode %{
5834     assert(UseAVX > 0, "required");
5835 
5836     int vlen_enc = Assembler::AVX_256bit;
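    // vcmpps leaves an all-ones/all-zeros mask per lane in $dst, which
    // vblendvps then uses to select between $src1 and $src2.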
5837     int cond = (Assembler::Condition)($copnd$$cmpcode);
5838     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5839     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5840   %}
5841   ins_pipe( pipe_slow );
5842 %}
5843 
5844 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5845   predicate(Matcher::vector_length(n) == 4);
5846   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
5847   effect(TEMP dst, USE src1, USE src2);
5848   format %{ "vcmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
5849             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
5850          %}
5851   ins_encode %{
5852     assert(UseAVX > 0, "required");
5853 
5854     int vlen_enc = Assembler::AVX_256bit;
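    // Same compare-then-blend selection as vcmov8F_reg, on packed doubles.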
5855     int cond = (Assembler::Condition)($copnd$$cmpcode);
5856     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5857     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5858   %}
5859   ins_pipe( pipe_slow );
5860 %}
5861 
5862 // --------------------------------- DIV --------------------------------------
5863 
5864 // Floats vector div
5865 instruct vdivF(vec dst, vec src) %{
5866   predicate(UseAVX == 0);
5867   match(Set dst (DivVF dst src));
5868   format %{ "divps   $dst,$src\t! div packedF" %}
5869   ins_encode %{
5870     __ divps($dst$$XMMRegister, $src$$XMMRegister);
5871   %}
5872   ins_pipe( pipe_slow );
5873 %}
5874 
5875 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
5876   predicate(UseAVX > 0);
5877   match(Set dst (DivVF src1 src2));
5878   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
5879   ins_encode %{
5880     int vlen_enc = vector_length_encoding(this);
5881     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5882   %}
5883   ins_pipe( pipe_slow );
5884 %}
5885 
5886 instruct vdivF_mem(vec dst, vec src, memory mem) %{
5887   predicate((UseAVX > 0) &&
5888             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5889   match(Set dst (DivVF src (LoadVector mem)));
5890   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
5891   ins_encode %{
5892     int vlen_enc = vector_length_encoding(this);
5893     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5894   %}
5895   ins_pipe( pipe_slow );
5896 %}
5897 
5898 // Doubles vector div
5899 instruct vdivD(vec dst, vec src) %{
5900   predicate(UseAVX == 0);
5901   match(Set dst (DivVD dst src));
5902   format %{ "divpd   $dst,$src\t! div packedD" %}
5903   ins_encode %{
5904     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
5905   %}
5906   ins_pipe( pipe_slow );
5907 %}
5908 
5909 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
5910   predicate(UseAVX > 0);
5911   match(Set dst (DivVD src1 src2));
5912   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
5913   ins_encode %{
5914     int vlen_enc = vector_length_encoding(this);
5915     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5916   %}
5917   ins_pipe( pipe_slow );
5918 %}
5919 
5920 instruct vdivD_mem(vec dst, vec src, memory mem) %{
5921   predicate((UseAVX > 0) &&
5922             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5923   match(Set dst (DivVD src (LoadVector mem)));
5924   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
5925   ins_encode %{
5926     int vlen_enc = vector_length_encoding(this);
5927     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5928   %}
5929   ins_pipe( pipe_slow );
5930 %}
5931 
5932 // ------------------------------ MinMax ---------------------------------------
5933 
5934 // Byte, Short, Int vector Min/Max
5935 instruct minmax_reg_sse(vec dst, vec src) %{
5936   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5937             UseAVX == 0);
5938   match(Set dst (MinV dst src));
5939   match(Set dst (MaxV dst src));
5940   format %{ "vector_minmax  $dst,$src\t!  " %}
5941   ins_encode %{
5942     assert(UseSSE >= 4, "required");
5943 
5944     int opcode = this->ideal_Opcode();
5945     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5946     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
5947   %}
5948   ins_pipe( pipe_slow );
5949 %}
5950 
5951 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
5952   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5953             UseAVX > 0);
5954   match(Set dst (MinV src1 src2));
5955   match(Set dst (MaxV src1 src2));
5956   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
5957   ins_encode %{
5958     int opcode = this->ideal_Opcode();
5959     int vlen_enc = vector_length_encoding(this);
5960     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5961 
5962     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5963   %}
5964   ins_pipe( pipe_slow );
5965 %}
5966 
5967 // Long vector Min/Max
5968 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
5969   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
5970             UseAVX == 0);
5971   match(Set dst (MinV dst src));
5972   match(Set dst (MaxV src dst));
5973   effect(TEMP dst, TEMP tmp);
5974   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
5975   ins_encode %{
5976     assert(UseSSE >= 4, "required");
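    // The temp is pinned to xmm0, presumably because the SSE4.1
    // variable-blend instructions used for the long compare-and-select
    // take their mask implicitly in xmm0.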
5977 
5978     int opcode = this->ideal_Opcode();
5979     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5980     assert(elem_bt == T_LONG, "sanity");
5981 
5982     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
5983   %}
5984   ins_pipe( pipe_slow );
5985 %}
5986 
5987 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
5988   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
5989             UseAVX > 0 && !VM_Version::supports_avx512vl());
5990   match(Set dst (MinV src1 src2));
5991   match(Set dst (MaxV src1 src2));
5992   effect(TEMP dst);
5993   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
5994   ins_encode %{
5995     int vlen_enc = vector_length_encoding(this);
5996     int opcode = this->ideal_Opcode();
5997     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5998     assert(elem_bt == T_LONG, "sanity");
5999 
6000     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6001   %}
6002   ins_pipe( pipe_slow );
6003 %}
6004 
6005 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
6006   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
6007             Matcher::vector_element_basic_type(n) == T_LONG);
6008   match(Set dst (MinV src1 src2));
6009   match(Set dst (MaxV src1 src2));
6010   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
6011   ins_encode %{
6012     assert(UseAVX > 2, "required");
6013 
6014     int vlen_enc = vector_length_encoding(this);
6015     int opcode = this->ideal_Opcode();
6016     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6017     assert(elem_bt == T_LONG, "sanity");
6018 
6019     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6020   %}
6021   ins_pipe( pipe_slow );
6022 %}
6023 
6024 // Float/Double vector Min/Max
6025 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
6026   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
6027             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
6028             UseAVX > 0);
6029   match(Set dst (MinV a b));
6030   match(Set dst (MaxV a b));
6031   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
6032   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
6033   ins_encode %{
6034     assert(UseAVX > 0, "required");
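    // Raw vminps/vmaxps do not give Java semantics for NaN and signed
    // zero; vminmax_fp uses the temps for the extra blend/fix-up steps.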
6035 
6036     int opcode = this->ideal_Opcode();
6037     int vlen_enc = vector_length_encoding(this);
6038     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6039 
6040     __ vminmax_fp(opcode, elem_bt,
6041                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
6042                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
6043   %}
6044   ins_pipe( pipe_slow );
6045 %}
6046 
6047 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
6048   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
6049             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
6050   match(Set dst (MinV a b));
6051   match(Set dst (MaxV a b));
6052   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
6053   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
6054   ins_encode %{
6055     assert(UseAVX > 2, "required");
6056 
6057     int opcode = this->ideal_Opcode();
6058     int vlen_enc = vector_length_encoding(this);
6059     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6060 
6061     __ evminmax_fp(opcode, elem_bt,
6062                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
6063                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
6064   %}
6065   ins_pipe( pipe_slow );
6066 %}
6067 
6068 // --------------------------------- Signum/CopySign ---------------------------
6069 
6070 instruct signumF_reg(regF dst, regF zero, regF one, rRegP scratch, rFlagsReg cr) %{
6071   match(Set dst (SignumF dst (Binary zero one)));
6072   effect(TEMP scratch, KILL cr);
6073   format %{ "signumF $dst, $dst\t! using $scratch as TEMP" %}
6074   ins_encode %{
6075     int opcode = this->ideal_Opcode();
6076     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
6077   %}
6078   ins_pipe( pipe_slow );
6079 %}
6080 
6081 instruct signumD_reg(regD dst, regD zero, regD one, rRegP scratch, rFlagsReg cr) %{
6082   match(Set dst (SignumD dst (Binary zero one)));
6083   effect(TEMP scratch, KILL cr);
6084   format %{ "signumD $dst, $dst\t! using $scratch as TEMP" %}
6085   ins_encode %{
6086     int opcode = this->ideal_Opcode();
6087     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
6088   %}
6089   ins_pipe( pipe_slow );
6090 %}
6091 
6092 // ---------------------------------------
6093 // For copySign use 0xE4 as the truth-table immediate (imm8) for vpternlog
6094 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
6095 // C (xmm2) is set to 0x7FFFFFFF
6096 // Wherever xmm2 is 0, we want to pick from B (sign)
6097 // Wherever xmm2 is 1, we want to pick from A (src)
6098 //
6099 // A B C Result
6100 // 0 0 0 0
6101 // 0 0 1 0
6102 // 0 1 0 1
6103 // 0 1 1 0
6104 // 1 0 0 0
6105 // 1 0 1 1
6106 // 1 1 0 1
6107 // 1 1 1 1
6108 //
6109 // Result going from high bit to low bit is 0b11100100 = 0xE4
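// i.e. vpternlog(dst, src, 0x7FF..F, 0xE4) computes
//   dst = (dst & 0x7FF..F) | (src & ~0x7FF..F)
// keeping the magnitude of dst and taking the sign from src.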
6110 // ---------------------------------------
6111 
6112 #ifdef _LP64
6113 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
6114   match(Set dst (CopySignF dst src));
6115   effect(TEMP tmp1, TEMP tmp2);
6116   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
6117   ins_encode %{
6118     __ movl($tmp2$$Register, 0x7FFFFFFF);
6119     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
6120     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
6121   %}
6122   ins_pipe( pipe_slow );
6123 %}
6124 
6125 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
6126   match(Set dst (CopySignD dst (Binary src zero)));
6127   ins_cost(100);
6128   effect(TEMP tmp1, TEMP tmp2);
6129   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
6130   ins_encode %{
6131     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
6132     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
6133     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
6134   %}
6135   ins_pipe( pipe_slow );
6136 %}
6137 #endif // _LP64
6138 
6139 // --------------------------------- Sqrt --------------------------------------
6140 
6141 instruct vsqrtF_reg(vec dst, vec src) %{
6142   match(Set dst (SqrtVF src));
6143   ins_cost(400);
6144   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
6145   ins_encode %{
6146     assert(UseAVX > 0, "required");
6147     int vlen_enc = vector_length_encoding(this);
6148     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6149   %}
6150   ins_pipe( pipe_slow );
6151 %}
6152 
6153 instruct vsqrtF_mem(vec dst, memory mem) %{
6154   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
6155   match(Set dst (SqrtVF (LoadVector mem)));
6156   ins_cost(400);
6157   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
6158   ins_encode %{
6159     assert(UseAVX > 0, "required");
6160     int vlen_enc = vector_length_encoding(this);
6161     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
6162   %}
6163   ins_pipe( pipe_slow );
6164 %}
6165 
6166 // Doubles vector sqrt
6167 instruct vsqrtD_reg(vec dst, vec src) %{
6168   match(Set dst (SqrtVD src));
6169   ins_cost(400);
6170   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
6171   ins_encode %{
6172     assert(UseAVX > 0, "required");
6173     int vlen_enc = vector_length_encoding(this);
6174     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6175   %}
6176   ins_pipe( pipe_slow );
6177 %}
6178 
6179 instruct vsqrtD_mem(vec dst, memory mem) %{
6180   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
6181   match(Set dst (SqrtVD (LoadVector mem)));
6182   ins_cost(400);
6183   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
6184   ins_encode %{
6185     assert(UseAVX > 0, "required");
6186     int vlen_enc = vector_length_encoding(this);
6187     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
6188   %}
6189   ins_pipe( pipe_slow );
6190 %}
6191 
6192 // ------------------------------ Shift ---------------------------------------
6193 
6194 // Left and right shift count vectors are the same on x86
6195 // (only lowest bits of xmm reg are used for count).
6196 instruct vshiftcnt(vec dst, rRegI cnt) %{
6197   match(Set dst (LShiftCntV cnt));
6198   match(Set dst (RShiftCntV cnt));
6199   format %{ "movdl    $dst,$cnt\t! load shift count" %}
6200   ins_encode %{
6201     __ movdl($dst$$XMMRegister, $cnt$$Register);
6202   %}
6203   ins_pipe( pipe_slow );
6204 %}
6205 
6206 // Byte vector shift
6207 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6208   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
6209   match(Set dst ( LShiftVB src shift));
6210   match(Set dst ( RShiftVB src shift));
6211   match(Set dst (URShiftVB src shift));
6212   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
6213   format %{"vector_byte_shift $dst,$src,$shift" %}
6214   ins_encode %{
6215     assert(UseSSE > 3, "required");
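    // No byte shift instruction: extend the bytes to words, shift as
    // words, keep the low byte of each result and pack back down.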
6216     int opcode = this->ideal_Opcode();
6217     bool sign = (opcode != Op_URShiftVB);
6218     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
6219     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
6220     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6221     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
6222     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
6223   %}
6224   ins_pipe( pipe_slow );
6225 %}
6226 
6227 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6228   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
6229             UseAVX <= 1);
6230   match(Set dst ( LShiftVB src shift));
6231   match(Set dst ( RShiftVB src shift));
6232   match(Set dst (URShiftVB src shift));
6233   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
6234   format %{"vector_byte_shift $dst,$src,$shift" %}
6235   ins_encode %{
6236     assert(UseSSE > 3, "required");
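    // Same extend-shift-mask-pack scheme, applied to the low and high
    // 8-byte halves and merged by the final packuswb.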
6237     int opcode = this->ideal_Opcode();
6238     bool sign = (opcode != Op_URShiftVB);
6239     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
6240     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
6241     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
6242     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
6243     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
6244     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6245     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
6246     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
6247     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
6248   %}
6249   ins_pipe( pipe_slow );
6250 %}
6251 
6252 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6253   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
6254             UseAVX > 1);
6255   match(Set dst ( LShiftVB src shift));
6256   match(Set dst ( RShiftVB src shift));
6257   match(Set dst (URShiftVB src shift));
6258   effect(TEMP dst, TEMP tmp, TEMP scratch);
6259   format %{"vector_byte_shift $dst,$src,$shift" %}
6260   ins_encode %{
6261     int opcode = this->ideal_Opcode();
6262     bool sign = (opcode != Op_URShiftVB);
6263     int vlen_enc = Assembler::AVX_256bit;
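    // With AVX2 all 16 bytes are extended into one 256-bit word vector,
    // shifted once, masked, and folded back with vextracti128_high/vpackuswb.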
6264     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
6265     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6266     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6267     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
6268     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
6269   %}
6270   ins_pipe( pipe_slow );
6271 %}
6272 
6273 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6274   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
6275   match(Set dst ( LShiftVB src shift));
6276   match(Set dst ( RShiftVB src shift));
6277   match(Set dst (URShiftVB src shift));
6278   effect(TEMP dst, TEMP tmp, TEMP scratch);
6279   format %{"vector_byte_shift $dst,$src,$shift" %}
6280   ins_encode %{
6281     assert(UseAVX > 1, "required");
6282     int opcode = this->ideal_Opcode();
6283     bool sign = (opcode != Op_URShiftVB);
6284     int vlen_enc = Assembler::AVX_256bit;
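    // Extend each 16-byte half to a 256-bit word vector, shift and mask,
    // then repack; vpermq with selector 0xD8 undoes the lane interleaving
    // left by the in-lane vpackuswb.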
6285     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
6286     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6287     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6288     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6289     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6290     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6291     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6292     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6293     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6294   %}
6295   ins_pipe( pipe_slow );
6296 %}
6297 
6298 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6299   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
6300   match(Set dst ( LShiftVB src shift));
6301   match(Set dst ( RShiftVB src shift));
6302   match(Set dst (URShiftVB src shift));
6303   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6304   format %{"vector_byte_shift $dst,$src,$shift" %}
6305   ins_encode %{
6306     assert(UseAVX > 2, "required");
6307     int opcode = this->ideal_Opcode();
6308     bool sign = (opcode != Op_URShiftVB);
6309     int vlen_enc = Assembler::AVX_512bit;
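    // Same halving scheme at 512 bits; the cross-lane fix-up after
    // vpackuswb uses a vpermq driven by the vector_byte_perm_mask table.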
6310     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6311     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
6312     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6313     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6314     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6315     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6316     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6317     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6318     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6319     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
6320     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
6321     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6322   %}
6323   ins_pipe( pipe_slow );
6324 %}
6325 
6326 // A logical right shift of shorts would produce an incorrect Java result
6327 // for negative data because Java code converts short values to ints with
6328 // sign extension before shifting. Char vectors are fine since chars are
6329 // unsigned values.
6330 // Shorts/Chars vector shift
6331 instruct vshiftS(vec dst, vec src, vec shift) %{
6332   predicate(!n->as_ShiftV()->is_var_shift());
6333   match(Set dst ( LShiftVS src shift));
6334   match(Set dst ( RShiftVS src shift));
6335   match(Set dst (URShiftVS src shift));
6336   effect(TEMP dst, USE src, USE shift);
6337   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
6338   ins_encode %{
6339     int opcode = this->ideal_Opcode();
6340     if (UseAVX > 0) {
6341       int vlen_enc = vector_length_encoding(this);
6342       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6343     } else {
6344       int vlen = Matcher::vector_length(this);
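      // Pre-AVX shifts are destructive, so first copy src into dst with a
      // move sized to the vector (4, 8 or 16 bytes).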
6345       if (vlen == 2) {
6346         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6347         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6348       } else if (vlen == 4) {
6349         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6350         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6351       } else {
6352         assert (vlen == 8, "sanity");
6353         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6354         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6355       }
6356     }
6357   %}
6358   ins_pipe( pipe_slow );
6359 %}
6360 
6361 // Integers vector shift
6362 instruct vshiftI(vec dst, vec src, vec shift) %{
6363   predicate(!n->as_ShiftV()->is_var_shift());
6364   match(Set dst ( LShiftVI src shift));
6365   match(Set dst ( RShiftVI src shift));
6366   match(Set dst (URShiftVI src shift));
6367   effect(TEMP dst, USE src, USE shift);
6368   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
6369   ins_encode %{
6370     int opcode = this->ideal_Opcode();
6371     if (UseAVX > 0) {
6372       int vlen_enc = vector_length_encoding(this);
6373       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6374     } else {
6375       int vlen = Matcher::vector_length(this);
6376       if (vlen == 2) {
6377         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6378         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6379       } else {
6380         assert(vlen == 4, "sanity");
6381         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6382         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6383       }
6384     }
6385   %}
6386   ins_pipe( pipe_slow );
6387 %}
6388 
6389 // Integers vector constant shift
6390 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
6391   match(Set dst (LShiftVI src (LShiftCntV shift)));
6392   match(Set dst (RShiftVI src (RShiftCntV shift)));
6393   match(Set dst (URShiftVI src (RShiftCntV shift)));
6394   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
6395   ins_encode %{
6396     int opcode = this->ideal_Opcode();
6397     if (UseAVX > 0) {
6398       int vector_len = vector_length_encoding(this);
6399       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6400     } else {
6401       int vlen = Matcher::vector_length(this);
6402       if (vlen == 2) {
6403         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6404         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6405       } else {
6406         assert(vlen == 4, "sanity");
6407         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6408         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6409       }
6410     }
6411   %}
6412   ins_pipe( pipe_slow );
6413 %}
6414 
6415 // Longs vector shift
6416 instruct vshiftL(vec dst, vec src, vec shift) %{
6417   predicate(!n->as_ShiftV()->is_var_shift());
6418   match(Set dst ( LShiftVL src shift));
6419   match(Set dst (URShiftVL src shift));
6420   effect(TEMP dst, USE src, USE shift);
6421   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
6422   ins_encode %{
6423     int opcode = this->ideal_Opcode();
6424     if (UseAVX > 0) {
6425       int vlen_enc = vector_length_encoding(this);
6426       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6427     } else {
6428       assert(Matcher::vector_length(this) == 2, "");
6429       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6430       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6431     }
6432   %}
6433   ins_pipe( pipe_slow );
6434 %}
6435 
6436 // Longs vector constant shift
6437 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
6438   match(Set dst (LShiftVL src (LShiftCntV shift)));
6439   match(Set dst (URShiftVL src (RShiftCntV shift)));
6440   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
6441   ins_encode %{
6442     int opcode = this->ideal_Opcode();
6443     if (UseAVX > 0) {
6444       int vector_len = vector_length_encoding(this);
6445       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6446     } else {
6447       assert(Matcher::vector_length(this) == 2, "");
6448       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6449       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6450     }
6451   %}
6452   ins_pipe( pipe_slow );
6453 %}
6454 
6455 // -------------------ArithmeticRightShift -----------------------------------
6456 // Long vector arithmetic right shift
6457 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6458   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
6459   match(Set dst (RShiftVL src shift));
6460   effect(TEMP dst, TEMP tmp, TEMP scratch);
6461   format %{ "vshiftq $dst,$src,$shift" %}
6462   ins_encode %{
6463     uint vlen = Matcher::vector_length(this);
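    // There is no psraq before AVX-512; emulate the arithmetic shift as
    //   sra(x, s) == (srl(x, s) ^ m) - m   with m = 0x8000...0 >>> s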
6464     if (vlen == 2) {
6465       assert(UseSSE >= 2, "required");
6466       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6467       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6468       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6469       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6470       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6471       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6472     } else {
6473       assert(vlen == 4, "sanity");
6474       assert(UseAVX > 1, "required");
6475       int vlen_enc = Assembler::AVX_256bit;
6476       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6477       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6478       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6479       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6480       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6481     }
6482   %}
6483   ins_pipe( pipe_slow );
6484 %}
6485 
6486 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6487   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
6488   match(Set dst (RShiftVL src shift));
6489   format %{ "vshiftq $dst,$src,$shift" %}
6490   ins_encode %{
6491     int vlen_enc = vector_length_encoding(this);
6492     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6493   %}
6494   ins_pipe( pipe_slow );
6495 %}
6496 
6497 // ------------------- Variable Shift -----------------------------
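// Variable shifts apply a separate, per-element shift count (is_var_shift()),
// instead of the single count used by the preceding rules.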
6498 // Byte variable shift
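// x86 has no byte-granular shift instruction. Without AVX512BW the bytes are
// widened to words (varshiftbw leaves word-sized results), shifted, and then
// narrowed back with vpackuswb, one 128-bit lane at a time for wider vectors.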
6499 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6500   predicate(Matcher::vector_length(n) <= 8 &&
6501             n->as_ShiftV()->is_var_shift() &&
6502             !VM_Version::supports_avx512bw());
6503   match(Set dst ( LShiftVB src shift));
6504   match(Set dst ( RShiftVB src shift));
6505   match(Set dst (URShiftVB src shift));
6506   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6507   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6508   ins_encode %{
6509     assert(UseAVX >= 2, "required");
6510 
6511     int opcode = this->ideal_Opcode();
6512     int vlen_enc = Assembler::AVX_128bit;
6513     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6514     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6515   %}
6516   ins_pipe( pipe_slow );
6517 %}
6518 
6519 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6520   predicate(Matcher::vector_length(n) == 16 &&
6521             n->as_ShiftV()->is_var_shift() &&
6522             !VM_Version::supports_avx512bw());
6523   match(Set dst ( LShiftVB src shift));
6524   match(Set dst ( RShiftVB src shift));
6525   match(Set dst (URShiftVB src shift));
6526   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6527   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6528   ins_encode %{
6529     assert(UseAVX >= 2, "required");
6530 
6531     int opcode = this->ideal_Opcode();
6532     int vlen_enc = Assembler::AVX_128bit;
6533     // Shift lower half and get word result in dst
6534     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6535 
6536     // Shift upper half and get word result in vtmp1
6537     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6538     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6539     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6540 
6541     // Merge and down convert the two word results to byte in dst
6542     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6543   %}
6544   ins_pipe( pipe_slow );
6545 %}
6546 
6547 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
6548   predicate(Matcher::vector_length(n) == 32 &&
6549             n->as_ShiftV()->is_var_shift() &&
6550             !VM_Version::supports_avx512bw());
6551   match(Set dst ( LShiftVB src shift));
6552   match(Set dst ( RShiftVB src shift));
6553   match(Set dst (URShiftVB src shift));
6554   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
6555   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
6556   ins_encode %{
6557     assert(UseAVX >= 2, "required");
6558 
6559     int opcode = this->ideal_Opcode();
6560     int vlen_enc = Assembler::AVX_128bit;
6561     // Process lower 128 bits and get result in dst
6562     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6563     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6564     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6565     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6566     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6567 
6568     // Process higher 128 bits and get result in vtmp3
6569     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6570     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6571     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
6572     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
6573     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
6574     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6575     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
6576 
6577     // Merge the two results in dst
6578     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6579   %}
6580   ins_pipe( pipe_slow );
6581 %}
6582 
6583 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6584   predicate(Matcher::vector_length(n) <= 32 &&
6585             n->as_ShiftV()->is_var_shift() &&
6586             VM_Version::supports_avx512bw());
6587   match(Set dst ( LShiftVB src shift));
6588   match(Set dst ( RShiftVB src shift));
6589   match(Set dst (URShiftVB src shift));
6590   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6591   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6592   ins_encode %{
6593     assert(UseAVX > 2, "required");
6594 
6595     int opcode = this->ideal_Opcode();
6596     int vlen_enc = vector_length_encoding(this);
6597     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6598   %}
6599   ins_pipe( pipe_slow );
6600 %}
6601 
6602 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6603   predicate(Matcher::vector_length(n) == 64 &&
6604             n->as_ShiftV()->is_var_shift() &&
6605             VM_Version::supports_avx512bw());
6606   match(Set dst ( LShiftVB src shift));
6607   match(Set dst ( RShiftVB src shift));
6608   match(Set dst (URShiftVB src shift));
6609   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6610   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6611   ins_encode %{
6612     assert(UseAVX > 2, "required");
6613 
6614     int opcode = this->ideal_Opcode();
6615     int vlen_enc = Assembler::AVX_256bit;
6616     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6617     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6618     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6619     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6620     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6621   %}
6622   ins_pipe( pipe_slow );
6623 %}
6624 
6625 // Short variable shift
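// Without AVX512BW there is no per-element word shift either: the shorts are
// extended to ints (vextendwd / vpmovzxwd), shifted per element with varshiftd,
// masked back to 16 bits and re-packed with vpackusdw.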
6626 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6627   predicate(Matcher::vector_length(n) <= 8 &&
6628             n->as_ShiftV()->is_var_shift() &&
6629             !VM_Version::supports_avx512bw());
6630   match(Set dst ( LShiftVS src shift));
6631   match(Set dst ( RShiftVS src shift));
6632   match(Set dst (URShiftVS src shift));
6633   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6634   format %{ "vector_varshift_short $dst, $src, $shift\t! using $vtmp, $scratch as TEMP" %}
6635   ins_encode %{
6636     assert(UseAVX >= 2, "required");
6637 
6638     int opcode = this->ideal_Opcode();
6639     bool sign = (opcode != Op_URShiftVS);
6640     int vlen_enc = Assembler::AVX_256bit;
6641     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6642     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6643     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
6644     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6645     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
6646     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6647   %}
6648   ins_pipe( pipe_slow );
6649 %}
6650 
6651 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6652   predicate(Matcher::vector_length(n) == 16 &&
6653             n->as_ShiftV()->is_var_shift() &&
6654             !VM_Version::supports_avx512bw());
6655   match(Set dst ( LShiftVS src shift));
6656   match(Set dst ( RShiftVS src shift));
6657   match(Set dst (URShiftVS src shift));
6658   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6659   format %{ "vector_varshift_short $dst, $src, $shift\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6660   ins_encode %{
6661     assert(UseAVX >= 2, "required");
6662 
6663     int opcode = this->ideal_Opcode();
6664     bool sign = (opcode != Op_URShiftVS);
6665     int vlen_enc = Assembler::AVX_256bit;
6666     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
6667     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6668     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6669     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6670     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6671 
6672     // Shift upper half, with result in dst using vtmp1 as TEMP
6673     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
6674     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
6675     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6676     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6677     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6678     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6679 
6680     // Merge lower and upper half result into dst
6681     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6682     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6683   %}
6684   ins_pipe( pipe_slow );
6685 %}
6686 
6687 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
6688   predicate(n->as_ShiftV()->is_var_shift() &&
6689             VM_Version::supports_avx512bw());
6690   match(Set dst ( LShiftVS src shift));
6691   match(Set dst ( RShiftVS src shift));
6692   match(Set dst (URShiftVS src shift));
6693   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
6694   ins_encode %{
6695     assert(UseAVX > 2, "required");
6696 
6697     int opcode = this->ideal_Opcode();
6698     int vlen_enc = vector_length_encoding(this);
6699     if (!VM_Version::supports_avx512vl()) {
6700       vlen_enc = Assembler::AVX_512bit;
6701     }
6702     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6703   %}
6704   ins_pipe( pipe_slow );
6705 %}
6706 
6707 // Integer variable shift
6708 instruct vshiftI_var(vec dst, vec src, vec shift) %{
6709   predicate(n->as_ShiftV()->is_var_shift());
6710   match(Set dst ( LShiftVI src shift));
6711   match(Set dst ( RShiftVI src shift));
6712   match(Set dst (URShiftVI src shift));
6713   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
6714   ins_encode %{
6715     assert(UseAVX >= 2, "required");
6716 
6717     int opcode = this->ideal_Opcode();
6718     int vlen_enc = vector_length_encoding(this);
6719     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6720   %}
6721   ins_pipe( pipe_slow );
6722 %}
6723 
6724 // Long variable shift
6725 instruct vshiftL_var(vec dst, vec src, vec shift) %{
6726   predicate(n->as_ShiftV()->is_var_shift());
6727   match(Set dst ( LShiftVL src shift));
6728   match(Set dst (URShiftVL src shift));
6729   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6730   ins_encode %{
6731     assert(UseAVX >= 2, "required");
6732 
6733     int opcode = this->ideal_Opcode();
6734     int vlen_enc = vector_length_encoding(this);
6735     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6736   %}
6737   ins_pipe( pipe_slow );
6738 %}
6739 
6740 // Long variable arithmetic right shift
6741 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
6742   predicate(Matcher::vector_length(n) <= 4 &&
6743             n->as_ShiftV()->is_var_shift() &&
6744             UseAVX == 2);
6745   match(Set dst (RShiftVL src shift));
6746   effect(TEMP dst, TEMP vtmp);
6747   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
6748   ins_encode %{
6749     int opcode = this->ideal_Opcode();
6750     int vlen_enc = vector_length_encoding(this);
6751     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
6752                  $vtmp$$XMMRegister);
6753   %}
6754   ins_pipe( pipe_slow );
6755 %}
6756 
6757 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
6758   predicate(n->as_ShiftV()->is_var_shift() &&
6759             UseAVX > 2);
6760   match(Set dst (RShiftVL src shift));
6761   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6762   ins_encode %{
6763     int opcode = this->ideal_Opcode();
6764     int vlen_enc = vector_length_encoding(this);
6765     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6766   %}
6767   ins_pipe( pipe_slow );
6768 %}
6769 
6770 // --------------------------------- AND --------------------------------------
6771 
6772 instruct vand(vec dst, vec src) %{
6773   predicate(UseAVX == 0);
6774   match(Set dst (AndV dst src));
6775   format %{ "pand    $dst,$src\t! and vectors" %}
6776   ins_encode %{
6777     __ pand($dst$$XMMRegister, $src$$XMMRegister);
6778   %}
6779   ins_pipe( pipe_slow );
6780 %}
6781 
6782 instruct vand_reg(vec dst, vec src1, vec src2) %{
6783   predicate(UseAVX > 0);
6784   match(Set dst (AndV src1 src2));
6785   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
6786   ins_encode %{
6787     int vlen_enc = vector_length_encoding(this);
6788     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6789   %}
6790   ins_pipe( pipe_slow );
6791 %}
6792 
6793 instruct vand_mem(vec dst, vec src, memory mem) %{
6794   predicate((UseAVX > 0) &&
6795             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6796   match(Set dst (AndV src (LoadVector mem)));
6797   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
6798   ins_encode %{
6799     int vlen_enc = vector_length_encoding(this);
6800     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6801   %}
6802   ins_pipe( pipe_slow );
6803 %}
6804 
6805 // --------------------------------- OR ---------------------------------------
6806 
6807 instruct vor(vec dst, vec src) %{
6808   predicate(UseAVX == 0);
6809   match(Set dst (OrV dst src));
6810   format %{ "por     $dst,$src\t! or vectors" %}
6811   ins_encode %{
6812     __ por($dst$$XMMRegister, $src$$XMMRegister);
6813   %}
6814   ins_pipe( pipe_slow );
6815 %}
6816 
6817 instruct vor_reg(vec dst, vec src1, vec src2) %{
6818   predicate(UseAVX > 0);
6819   match(Set dst (OrV src1 src2));
6820   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
6821   ins_encode %{
6822     int vlen_enc = vector_length_encoding(this);
6823     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6824   %}
6825   ins_pipe( pipe_slow );
6826 %}
6827 
6828 instruct vor_mem(vec dst, vec src, memory mem) %{
6829   predicate((UseAVX > 0) &&
6830             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6831   match(Set dst (OrV src (LoadVector mem)));
6832   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
6833   ins_encode %{
6834     int vlen_enc = vector_length_encoding(this);
6835     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6836   %}
6837   ins_pipe( pipe_slow );
6838 %}
6839 
6840 // --------------------------------- XOR --------------------------------------
6841 
6842 instruct vxor(vec dst, vec src) %{
6843   predicate(UseAVX == 0);
6844   match(Set dst (XorV dst src));
6845   format %{ "pxor    $dst,$src\t! xor vectors" %}
6846   ins_encode %{
6847     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
6848   %}
6849   ins_pipe( pipe_slow );
6850 %}
6851 
6852 instruct vxor_reg(vec dst, vec src1, vec src2) %{
6853   predicate(UseAVX > 0);
6854   match(Set dst (XorV src1 src2));
6855   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
6856   ins_encode %{
6857     int vlen_enc = vector_length_encoding(this);
6858     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6859   %}
6860   ins_pipe( pipe_slow );
6861 %}
6862 
6863 instruct vxor_mem(vec dst, vec src, memory mem) %{
6864   predicate((UseAVX > 0) &&
6865             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6866   match(Set dst (XorV src (LoadVector mem)));
6867   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
6868   ins_encode %{
6869     int vlen_enc = vector_length_encoding(this);
6870     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6871   %}
6872   ins_pipe( pipe_slow );
6873 %}
6874 
6875 // --------------------------------- VectorCast --------------------------------------
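// Widening casts sign-extend with vpmovsx* and, for FP targets, finish with
// vcvtdq2ps/vcvtdq2pd. Narrowing casts use the truncating evpmov* forms where
// AVX-512 allows it and otherwise fall back to mask-and-pack (vpand + vpackus*).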
6876 
6877 instruct vcastBtoX(vec dst, vec src) %{
6878   match(Set dst (VectorCastB2X src));
6879   format %{ "vector_cast_b2x $dst,$src\t!" %}
6880   ins_encode %{
6881     assert(UseAVX > 0, "required");
6882 
6883     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6884     int vlen_enc = vector_length_encoding(this);
6885     switch (to_elem_bt) {
6886       case T_SHORT:
6887         __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6888         break;
6889       case T_INT:
6890         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6891         break;
6892       case T_FLOAT:
6893         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6894         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6895         break;
6896       case T_LONG:
6897         __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6898         break;
6899       case T_DOUBLE: {
6900         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
6901         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
6902         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6903         break;
6904       }
6905       default: assert(false, "%s", type2name(to_elem_bt));
6906     }
6907   %}
6908   ins_pipe( pipe_slow );
6909 %}
6910 
6911 instruct castStoX(vec dst, vec src, rRegP scratch) %{
6912   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6913             Matcher::vector_length(n->in(1)) <= 8 && // src
6914             Matcher::vector_element_basic_type(n) == T_BYTE);
6915   effect(TEMP scratch);
6916   match(Set dst (VectorCastS2X src));
6917   format %{ "vector_cast_s2x $dst,$src\t! using $scratch as TEMP" %}
6918   ins_encode %{
6919     assert(UseAVX > 0, "required");
6920 
6921     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, $scratch$$Register);
6922     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6923   %}
6924   ins_pipe( pipe_slow );
6925 %}
6926 
6927 instruct vcastStoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6928   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6929             Matcher::vector_length(n->in(1)) == 16 && // src
6930             Matcher::vector_element_basic_type(n) == T_BYTE);
6931   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6932   match(Set dst (VectorCastS2X src));
6933   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp, $scratch as TEMP" %}
6934   ins_encode %{
6935     assert(UseAVX > 0, "required");
6936 
6937     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
6938     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6939     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
6940     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6941   %}
6942   ins_pipe( pipe_slow );
6943 %}
6944 
6945 instruct vcastStoX_evex(vec dst, vec src) %{
6946   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
6947             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
6948   match(Set dst (VectorCastS2X src));
6949   format %{ "vector_cast_s2x $dst,$src\t!" %}
6950   ins_encode %{
6951     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6952     int src_vlen_enc = vector_length_encoding(this, $src);
6953     int vlen_enc = vector_length_encoding(this);
6954     switch (to_elem_bt) {
6955       case T_BYTE:
6956         if (!VM_Version::supports_avx512vl()) {
6957           vlen_enc = Assembler::AVX_512bit;
6958         }
6959         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6960         break;
6961       case T_INT:
6962         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6963         break;
6964       case T_FLOAT:
6965         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6966         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6967         break;
6968       case T_LONG:
6969         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6970         break;
6971       case T_DOUBLE: {
6972         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
6973         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
6974         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6975         break;
6976       }
6977       default:
6978         ShouldNotReachHere();
6979     }
6980   %}
6981   ins_pipe( pipe_slow );
6982 %}
6983 
6984 instruct castItoX(vec dst, vec src, rRegP scratch) %{
6985   predicate(UseAVX <= 2 &&
6986             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
6987             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
6988   match(Set dst (VectorCastI2X src));
6989   format %{ "vector_cast_i2x $dst,$src\t! using $scratch as TEMP" %}
6990   effect(TEMP scratch);
6991   ins_encode %{
6992     assert(UseAVX > 0, "required");
6993 
6994     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6995     int vlen_enc = vector_length_encoding(this, $src);
6996 
6997     if (to_elem_bt == T_BYTE) {
6998       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
6999       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7000       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7001     } else {
7002       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
7003       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
7004       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7005     }
7006   %}
7007   ins_pipe( pipe_slow );
7008 %}
7009 
7010 instruct vcastItoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
7011   predicate(UseAVX <= 2 &&
7012             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
7013             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
7014   match(Set dst (VectorCastI2X src));
7015   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp and $scratch as TEMP" %}
7016   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7017   ins_encode %{
7018     assert(UseAVX > 0, "required");
7019 
7020     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7021     int vlen_enc = vector_length_encoding(this, $src);
7022 
7023     if (to_elem_bt == T_BYTE) {
7024       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
7025       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
7026       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7027       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7028     } else {
7029       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
7030       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
7031       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
7032       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7033     }
7034   %}
7035   ins_pipe( pipe_slow );
7036 %}
7037 
7038 instruct vcastItoX_evex(vec dst, vec src) %{
7039   predicate(UseAVX > 2 ||
7040             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
7041   match(Set dst (VectorCastI2X src));
7042   format %{ "vector_cast_i2x $dst,$src\t!" %}
7043   ins_encode %{
7044     assert(UseAVX > 0, "required");
7045 
7046     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
7047     int src_vlen_enc = vector_length_encoding(this, $src);
7048     int dst_vlen_enc = vector_length_encoding(this);
7049     switch (dst_elem_bt) {
7050       case T_BYTE:
7051         if (!VM_Version::supports_avx512vl()) {
7052           src_vlen_enc = Assembler::AVX_512bit;
7053         }
7054         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7055         break;
7056       case T_SHORT:
7057         if (!VM_Version::supports_avx512vl()) {
7058           src_vlen_enc = Assembler::AVX_512bit;
7059         }
7060         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7061         break;
7062       case T_FLOAT:
7063         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7064         break;
7065       case T_LONG:
7066         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7067         break;
7068       case T_DOUBLE:
7069         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7070         break;
7071       default:
7072         ShouldNotReachHere();
7073     }
7074   %}
7075   ins_pipe( pipe_slow );
7076 %}
7077 
7078 instruct vcastLtoBS(vec dst, vec src, rRegP scratch) %{
7079   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
7080             UseAVX <= 2);
7081   match(Set dst (VectorCastL2X src));
7082   effect(TEMP scratch);
7083   format %{ "vector_cast_l2x  $dst,$src\t! using $scratch as TEMP" %}
7084   ins_encode %{
7085     assert(UseAVX > 0, "required");
7086 
7087     int vlen = Matcher::vector_length_in_bytes(this, $src);
7088     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
7089     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
7090                                                       : ExternalAddress(vector_int_to_short_mask());
7091     if (vlen <= 16) {
7092       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
7093       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
7094       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7095     } else {
7096       assert(vlen <= 32, "required");
7097       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
7098       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
7099       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
7100       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7101     }
7102     if (to_elem_bt == T_BYTE) {
7103       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7104     }
7105   %}
7106   ins_pipe( pipe_slow );
7107 %}
7108 
7109 instruct vcastLtoX_evex(vec dst, vec src) %{
7110   predicate(UseAVX > 2 ||
7111             (Matcher::vector_element_basic_type(n) == T_INT ||
7112              Matcher::vector_element_basic_type(n) == T_FLOAT ||
7113              Matcher::vector_element_basic_type(n) == T_DOUBLE));
7114   match(Set dst (VectorCastL2X src));
7115   format %{ "vector_cast_l2x  $dst,$src\t!" %}
7116   ins_encode %{
7117     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7118     int vlen = Matcher::vector_length_in_bytes(this, $src);
7119     int vlen_enc = vector_length_encoding(this, $src);
7120     switch (to_elem_bt) {
7121       case T_BYTE:
7122         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
7123           vlen_enc = Assembler::AVX_512bit;
7124         }
7125         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7126         break;
7127       case T_SHORT:
7128         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
7129           vlen_enc = Assembler::AVX_512bit;
7130         }
7131         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7132         break;
7133       case T_INT:
7134         if (vlen == 8) {
7135           if ($dst$$XMMRegister != $src$$XMMRegister) {
7136             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
7137           }
7138         } else if (vlen == 16) {
7139           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
7140         } else if (vlen == 32) {
7141           if (UseAVX > 2) {
7142             if (!VM_Version::supports_avx512vl()) {
7143               vlen_enc = Assembler::AVX_512bit;
7144             }
7145             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7146           } else {
7147             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
7148             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
7149           }
7150         } else { // vlen == 64
7151           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7152         }
7153         break;
7154       case T_FLOAT:
7155         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
7156         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7157         break;
7158       case T_DOUBLE:
7159         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
7160         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7161         break;
7162 
7163       default: assert(false, "%s", type2name(to_elem_bt));
7164     }
7165   %}
7166   ins_pipe( pipe_slow );
7167 %}
7168 
7169 instruct vcastFtoD_reg(vec dst, vec src) %{
7170   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
7171   match(Set dst (VectorCastF2X src));
7172   format %{ "vector_cast_f2d  $dst,$src\t!" %}
7173   ins_encode %{
7174     int vlen_enc = vector_length_encoding(this);
7175     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7176   %}
7177   ins_pipe( pipe_slow );
7178 %}
7179 
7180 
7181 instruct castFtoI_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rRegP scratch, rFlagsReg cr) %{
7182   predicate(!VM_Version::supports_avx512vl() &&
7183             Matcher::vector_length_in_bytes(n) < 64 &&
7184             Matcher::vector_element_basic_type(n) == T_INT);
7185   match(Set dst (VectorCastF2X src));
7186   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP scratch, KILL cr);
7187   format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $scratch as TEMP" %}
7188   ins_encode %{
7189     int vlen_enc = vector_length_encoding(this);
7190     __ vector_castF2I_avx($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7191                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
7192                           ExternalAddress(vector_float_signflip()), $scratch$$Register, vlen_enc);
7193   %}
7194   ins_pipe( pipe_slow );
7195 %}
7196 
7197 instruct castFtoI_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
7198   predicate((VM_Version::supports_avx512vl() ||
7199              Matcher::vector_length_in_bytes(n) == 64) &&
7200              Matcher::vector_element_basic_type(n) == T_INT);
7201   match(Set dst (VectorCastF2X src));
7202   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
7203   format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %}
7204   ins_encode %{
7205     int vlen_enc = vector_length_encoding(this);
7206     __ vector_castF2I_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7207                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
7208                            ExternalAddress(vector_float_signflip()), $scratch$$Register, vlen_enc);
7209   %}
7210   ins_pipe( pipe_slow );
7211 %}
7212 
7213 instruct vcastDtoF_reg(vec dst, vec src) %{
7214   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
7215   match(Set dst (VectorCastD2X src));
7216   format %{ "vector_cast_d2x  $dst,$src\t!" %}
7217   ins_encode %{
7218     int vlen_enc = vector_length_encoding(this, $src);
7219     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7220   %}
7221   ins_pipe( pipe_slow );
7222 %}
7223 
7224 instruct castDtoL_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
7225   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
7226   match(Set dst (VectorCastD2X src));
7227   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
7228   format %{ "vector_cast_d2l $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %}
7229   ins_encode %{
7230     int vlen_enc = vector_length_encoding(this);
7231     __ vector_castD2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7232                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
7233                            ExternalAddress(vector_double_signflip()), $scratch$$Register, vlen_enc);
7234   %}
7235   ins_pipe( pipe_slow );
7236 %}
7237 
7238 instruct vucast(vec dst, vec src) %{
7239   match(Set dst (VectorUCastB2X src));
7240   match(Set dst (VectorUCastS2X src));
7241   match(Set dst (VectorUCastI2X src));
7242   format %{ "vector_ucast $dst,$src\t!" %}
7243   ins_encode %{
7244     assert(UseAVX > 0, "required");
7245 
7246     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
7247     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7248     int vlen_enc = vector_length_encoding(this);
7249     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
7250   %}
7251   ins_pipe( pipe_slow );
7252 %}
7253 
7254 #ifdef _LP64
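// Note for the Round* rules below: 0x3F80 is an MXCSR image that differs from the
// default 0x1F80 only in the rounding-control field (round toward negative
// infinity). It is handed to the macro assembler round helpers, which presumably
// switch MXCSR around the conversion to get Math.round semantics.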
7255 instruct vround_float_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rRegP scratch, rFlagsReg cr) %{
7256   predicate(!VM_Version::supports_avx512vl() &&
7257             Matcher::vector_length_in_bytes(n) < 64 &&
7258             Matcher::vector_element_basic_type(n) == T_INT);
7259   match(Set dst (RoundVF src));
7260   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP scratch, KILL cr);
7261   format %{ "vector_round_float $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3, $xtmp4 and $scratch as TEMP" %}
7262   ins_encode %{
7263     int vlen_enc = vector_length_encoding(this);
7264     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
7265     __ vector_round_float_avx($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7266                               $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
7267                               ExternalAddress(vector_float_signflip()), new_mxcsr, $scratch$$Register, vlen_enc);
7268   %}
7269   ins_pipe( pipe_slow );
7270 %}
7271 
7272 instruct vround_float_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
7273   predicate((VM_Version::supports_avx512vl() ||
7274              Matcher::vector_length_in_bytes(n) == 64) &&
7275              Matcher::vector_element_basic_type(n) == T_INT);
7276   match(Set dst (RoundVF src));
7277   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
7278   format %{ "vector_round_float $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %}
7279   ins_encode %{
7280     int vlen_enc = vector_length_encoding(this);
7281     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
7282     __ vector_round_float_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7283                                $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
7284                                ExternalAddress(vector_float_signflip()), new_mxcsr, $scratch$$Register, vlen_enc);
7285   %}
7286   ins_pipe( pipe_slow );
7287 %}
7288 
7289 instruct vround_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
7290   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
7291   match(Set dst (RoundVD src));
7292   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
7293   format %{ "vector_round_long $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1, $ktmp2 and $scratch as TEMP" %}
7294   ins_encode %{
7295     int vlen_enc = vector_length_encoding(this);
7296     InternalAddress new_mxcsr = $constantaddress((jint)0x3F80);
7297     __ vector_round_double_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7298                                 $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
7299                                 ExternalAddress(vector_double_signflip()), new_mxcsr, $scratch$$Register, vlen_enc);
7300   %}
7301   ins_pipe( pipe_slow );
7302 %}
7303 #endif
7304 // --------------------------------- VectorMaskCmp --------------------------------------
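// Two result shapes: when the node's bottom type is not a vectmask the result is
// a vector with all-ones/all-zeros lanes (materialized from vector_all_bits_set
// on the EVEX paths); with a vectmask bottom type the result is a k opmask
// register written directly by evcmpps/evcmppd/evpcmp*.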
7305 
7306 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
7307   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7308             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
7309             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7310             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7311   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7312   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
7313   ins_encode %{
7314     int vlen_enc = vector_length_encoding(this, $src1);
7315     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7316     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7317       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7318     } else {
7319       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7320     }
7321   %}
7322   ins_pipe( pipe_slow );
7323 %}
7324 
7325 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
7326   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
7327             n->bottom_type()->isa_vectmask() == NULL &&
7328             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7329   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7330   effect(TEMP scratch, TEMP ktmp);
7331   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7332   ins_encode %{
7333     int vlen_enc = Assembler::AVX_512bit;
7334     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7335     KRegister mask = k0; // The comparison itself is not being masked.
7336     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7337       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7338       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
7339     } else {
7340       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7341       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
7342     }
7343   %}
7344   ins_pipe( pipe_slow );
7345 %}
7346 
7347 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
7348   predicate(n->bottom_type()->isa_vectmask() &&
7349             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7350   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7351   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
7352   ins_encode %{
7353     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
7354     int vlen_enc = vector_length_encoding(this, $src1);
7355     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7356     KRegister mask = k0; // The comparison itself is not being masked.
7357     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7358       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7359     } else {
7360       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7361     }
7362   %}
7363   ins_pipe( pipe_slow );
7364 %}
7365 
7366 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
7367   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7368             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7369             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
7370             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7371             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
7372             (n->in(2)->get_int() == BoolTest::eq ||
7373              n->in(2)->get_int() == BoolTest::lt ||
7374              n->in(2)->get_int() == BoolTest::gt)); // cond
7375   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7376   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
7377   ins_encode %{
7378     int vlen_enc = vector_length_encoding(this, $src1);
7379     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7380     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
7381     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
7382   %}
7383   ins_pipe( pipe_slow );
7384 %}
7385 
7386 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
7387   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7388             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7389             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
7390             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7391             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
7392             (n->in(2)->get_int() == BoolTest::ne ||
7393              n->in(2)->get_int() == BoolTest::le ||
7394              n->in(2)->get_int() == BoolTest::ge)); // cond
7395   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7396   effect(TEMP dst, TEMP xtmp);
7397   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
7398   ins_encode %{
7399     int vlen_enc = vector_length_encoding(this, $src1);
7400     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7401     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
7402     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
7403   %}
7404   ins_pipe( pipe_slow );
7405 %}
7406 
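// Unsigned compare without AVX-512: flip the sign bit of both operands (XOR with
// high_bit_set), then a signed compare yields the unsigned ordering, since
// a <u b  <==>  (a ^ MIN) <s (b ^ MIN).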
7407 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
7408   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7409             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7410             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
7411             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7412             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7413   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7414   effect(TEMP dst, TEMP xtmp);
7415   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
7416   ins_encode %{
7417     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
7418     int vlen_enc = vector_length_encoding(this, $src1);
7419     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7420     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
7421 
7422     if (vlen_enc == Assembler::AVX_128bit) {
7423       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
7424     } else {
7425       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
7426     }
7427     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
7428     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
7429     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
7430   %}
7431   ins_pipe( pipe_slow );
7432 %}
7433 
7434 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
7435   predicate((n->bottom_type()->isa_vectmask() == NULL &&
7436              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
7437              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7438   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7439   effect(TEMP scratch, TEMP ktmp);
7440   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7441   ins_encode %{
7442     assert(UseAVX > 2, "required");
7443 
7444     int vlen_enc = vector_length_encoding(this, $src1);
7445     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7446     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
7447     KRegister mask = k0; // The comparison itself is not being masked.
7448     bool merge = false;
7449     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7450 
7451     switch (src1_elem_bt) {
7452       case T_INT: {
7453         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7454         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7455         break;
7456       }
7457       case T_LONG: {
7458         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7459         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7460         break;
7461       }
7462       default: assert(false, "%s", type2name(src1_elem_bt));
7463     }
7464   %}
7465   ins_pipe( pipe_slow );
7466 %}
7467 
7468 
7469 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
7470   predicate(n->bottom_type()->isa_vectmask() &&
7471             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7472   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7473   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
7474   ins_encode %{
7475     assert(UseAVX > 2, "required");
7476     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
7477 
7478     int vlen_enc = vector_length_encoding(this, $src1);
7479     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7480     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
7481     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7482 
7483     // Compare element-wise according to the element type; the result is the opmask in $dst.
7484     switch (src1_elem_bt) {
7485       case T_BYTE: {
7486         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7487         break;
7488       }
7489       case T_SHORT: {
7490         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7491         break;
7492       }
7493       case T_INT: {
7494         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7495         break;
7496       }
7497       case T_LONG: {
7498         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7499         break;
7500       }
7501       default: assert(false, "%s", type2name(src1_elem_bt));
7502     }
7503   %}
7504   ins_pipe( pipe_slow );
7505 %}
7506 
7507 // Extract
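// For sources wider than 128 bits the element's 128-bit lane is first isolated
// into a temporary (get_lane) and the element is then read from it (get_elem);
// 128-bit sources extract directly.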
7508 
7509 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
7510   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
7511   match(Set dst (ExtractI src idx));
7512   match(Set dst (ExtractS src idx));
7513 #ifdef _LP64
7514   match(Set dst (ExtractB src idx));
7515 #endif
7516   format %{ "extractI $dst,$src,$idx\t!" %}
7517   ins_encode %{
7518     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7519 
7520     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7521     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7522   %}
7523   ins_pipe( pipe_slow );
7524 %}
7525 
7526 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
7527   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
7528             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
7529   match(Set dst (ExtractI src idx));
7530   match(Set dst (ExtractS src idx));
7531 #ifdef _LP64
7532   match(Set dst (ExtractB src idx));
7533 #endif
7534   effect(TEMP vtmp);
7535   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
7536   ins_encode %{
7537     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7538 
7539     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7540     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7541     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
7542   %}
7543   ins_pipe( pipe_slow );
7544 %}
7545 
7546 #ifdef _LP64
7547 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
7548   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
7549   match(Set dst (ExtractL src idx));
7550   format %{ "extractL $dst,$src,$idx\t!" %}
7551   ins_encode %{
7552     assert(UseSSE >= 4, "required");
7553     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7554 
7555     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7556   %}
7557   ins_pipe( pipe_slow );
7558 %}
7559 
7560 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
7561   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7562             Matcher::vector_length(n->in(1)) == 8);  // src
7563   match(Set dst (ExtractL src idx));
7564   effect(TEMP vtmp);
7565   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
7566   ins_encode %{
7567     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7568 
7569     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7570     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
7571   %}
7572   ins_pipe( pipe_slow );
7573 %}
7574 #endif
7575 
7576 instruct extractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7577   predicate(Matcher::vector_length(n->in(1)) <= 4);
7578   match(Set dst (ExtractF src idx));
7579   effect(TEMP dst, TEMP tmp, TEMP vtmp);
7580   format %{ "extractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7581   ins_encode %{
7582     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7583 
7584     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $tmp$$Register, $vtmp$$XMMRegister);
7585   %}
7586   ins_pipe( pipe_slow );
7587 %}
7588 
7589 instruct vextractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7590   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
7591             Matcher::vector_length(n->in(1)/*src*/) == 16);
7592   match(Set dst (ExtractF src idx));
7593   effect(TEMP tmp, TEMP vtmp);
7594   format %{ "vextractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7595   ins_encode %{
7596     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7597 
7598     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7599     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant, $tmp$$Register);
7600   %}
7601   ins_pipe( pipe_slow );
7602 %}
7603 
7604 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
7605   predicate(Matcher::vector_length(n->in(1)) == 2); // src
7606   match(Set dst (ExtractD src idx));
7607   format %{ "extractD $dst,$src,$idx\t!" %}
7608   ins_encode %{
7609     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7610 
7611     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7612   %}
7613   ins_pipe( pipe_slow );
7614 %}
7615 
7616 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
7617   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7618             Matcher::vector_length(n->in(1)) == 8);  // src
7619   match(Set dst (ExtractD src idx));
7620   effect(TEMP vtmp);
7621   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
7622   ins_encode %{
7623     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7624 
7625     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7626     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
7627   %}
7628   ins_pipe( pipe_slow );
7629 %}
7630 
7631 // --------------------------------- Vector Blend --------------------------------------
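// SSE4.1 pblendvb takes its mask implicitly in xmm0 (hence the rxmm0 temp in
// blendvp), AVX passes the mask as an explicit operand (vpblendvb/vblendvps),
// and the 512-bit/vectmask forms blend under a k opmask via evpblend.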
7632 
7633 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
7634   predicate(UseAVX == 0);
7635   match(Set dst (VectorBlend (Binary dst src) mask));
7636   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
7637   effect(TEMP tmp);
7638   ins_encode %{
7639     assert(UseSSE >= 4, "required");
7640 
7641     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
7642       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
7643     }
7644     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
7645   %}
7646   ins_pipe( pipe_slow );
7647 %}
7648 
7649 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
7650   predicate(UseAVX > 0 &&
7651             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
7652             Matcher::vector_length_in_bytes(n) <= 32 &&
7653             is_integral_type(Matcher::vector_element_basic_type(n)));
7654   match(Set dst (VectorBlend (Binary src1 src2) mask));
7655   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7656   ins_encode %{
7657     int vlen_enc = vector_length_encoding(this);
7658     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7659   %}
7660   ins_pipe( pipe_slow );
7661 %}
7662 
7663 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
7664   predicate(UseAVX > 0 &&
7665             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
7666             Matcher::vector_length_in_bytes(n) <= 32 &&
7667             !is_integral_type(Matcher::vector_element_basic_type(n)));
7668   match(Set dst (VectorBlend (Binary src1 src2) mask));
7669   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7670   ins_encode %{
7671     int vlen_enc = vector_length_encoding(this);
7672     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7673   %}
7674   ins_pipe( pipe_slow );
7675 %}
7676 
7677 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch, kReg ktmp) %{
7678   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
7679             n->in(2)->bottom_type()->isa_vectmask() == NULL);
7680   match(Set dst (VectorBlend (Binary src1 src2) mask));
7681   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch and k2 as TEMP" %}
7682   effect(TEMP scratch, TEMP ktmp);
7683   ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
7686     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register);
7687     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7688   %}
7689   ins_pipe( pipe_slow );
7690 %}
7691 
7692 
7693 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask, rRegP scratch) %{
7694   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
7695             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
7696              VM_Version::supports_avx512bw()));
7697   match(Set dst (VectorBlend (Binary src1 src2) mask));
7698   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch and k2 as TEMP" %}
7699   effect(TEMP scratch);
7700   ins_encode %{
7701     int vlen_enc = vector_length_encoding(this);
7702     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7703     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7704   %}
7705   ins_pipe( pipe_slow );
7706 %}
7707 
7708 // --------------------------------- ABS --------------------------------------
7709 // a = |a|
7710 instruct vabsB_reg(vec dst, vec src) %{
7711   match(Set dst (AbsVB  src));
7712   ins_cost(450);
7713   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
7714   ins_encode %{
7715     uint vlen = Matcher::vector_length(this);
7716     if (vlen <= 16) {
7717       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7718     } else {
7719       int vlen_enc = vector_length_encoding(this);
7720       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7721     }
7722   %}
7723   ins_pipe( pipe_slow );
7724 %}
7725 
7726 instruct vabsS_reg(vec dst, vec src) %{
7727   match(Set dst (AbsVS  src));
7728   ins_cost(450);
7729   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
7730   ins_encode %{
7731     uint vlen = Matcher::vector_length(this);
7732     if (vlen <= 8) {
7733       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7734     } else {
7735       int vlen_enc = vector_length_encoding(this);
7736       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7737     }
7738   %}
7739   ins_pipe( pipe_slow );
7740 %}
7741 
7742 instruct vabsI_reg(vec dst, vec src) %{
7743   match(Set dst (AbsVI  src));
7744   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
7745   ins_cost(250);
7746   ins_encode %{
7747     uint vlen = Matcher::vector_length(this);
7748     if (vlen <= 4) {
7749       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7750     } else {
7751       int vlen_enc = vector_length_encoding(this);
7752       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7753     }
7754   %}
7755   ins_pipe( pipe_slow );
7756 %}
7757 
7758 instruct vabsL_reg(vec dst, vec src) %{
7759   match(Set dst (AbsVL  src));
7760   ins_cost(450);
7761   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
7762   ins_encode %{
7763     assert(UseAVX > 2, "required");
7764     int vlen_enc = vector_length_encoding(this);
7765     if (!VM_Version::supports_avx512vl()) {
7766       vlen_enc = Assembler::AVX_512bit;
7767     }
7768     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7769   %}
7770   ins_pipe( pipe_slow );
7771 %}
7772 
7773 // --------------------------------- ABSNEG --------------------------------------
7774 
7775 instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
7776   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
7777   match(Set dst (AbsVF src));
7778   match(Set dst (NegVF src));
7779   effect(TEMP scratch);
7780   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
7781   ins_cost(150);
7782   ins_encode %{
7783     int opcode = this->ideal_Opcode();
7784     int vlen = Matcher::vector_length(this);
7785     if (vlen == 2) {
7786       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7787     } else {
7788       assert(vlen == 8 || vlen == 16, "required");
7789       int vlen_enc = vector_length_encoding(this);
7790       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7791     }
7792   %}
7793   ins_pipe( pipe_slow );
7794 %}
7795 
7796 instruct vabsneg4F(vec dst, rRegI scratch) %{
7797   predicate(Matcher::vector_length(n) == 4);
7798   match(Set dst (AbsVF dst));
7799   match(Set dst (NegVF dst));
7800   effect(TEMP scratch);
7801   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
7802   ins_cost(150);
7803   ins_encode %{
7804     int opcode = this->ideal_Opcode();
7805     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
7806   %}
7807   ins_pipe( pipe_slow );
7808 %}
7809 
7810 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
7811   match(Set dst (AbsVD  src));
7812   match(Set dst (NegVD  src));
7813   effect(TEMP scratch);
7814   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
7815   ins_encode %{
7816     int opcode = this->ideal_Opcode();
7817     uint vlen = Matcher::vector_length(this);
7818     if (vlen == 2) {
7819       assert(UseSSE >= 2, "required");
7820       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7821     } else {
7822       int vlen_enc = vector_length_encoding(this);
7823       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7824     }
7825   %}
7826   ins_pipe( pipe_slow );
7827 %}
7828 
7829 //------------------------------------- VectorTest --------------------------------------------
7830 
7831 #ifdef _LP64
7832 instruct vptest_alltrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp1, legVec vtmp2, rFlagsReg cr) %{
7833   predicate(!VM_Version::supports_avx512bwdq() &&
7834             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7835             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7836             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7837   match(Set dst (VectorTest src1 src2 ));
7838   effect(TEMP vtmp1, TEMP vtmp2, KILL cr);
7839   format %{ "vptest_alltrue_lt16 $dst,$src1, $src2\t! using $vtmp1, $vtmp2 and $cr as TEMP" %}
7840   ins_encode %{
7841     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7842     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
7843     __ setb(Assembler::carrySet, $dst$$Register);
7844     __ movzbl($dst$$Register, $dst$$Register);
7845   %}
7846   ins_pipe( pipe_slow );
7847 %}
7848 
7849 instruct vptest_alltrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7850   predicate(!VM_Version::supports_avx512bwdq() &&
7851             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7852             Matcher::vector_length_in_bytes(n->in(1)) <  64 &&
7853             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7854   match(Set dst (VectorTest src1 src2 ));
7855   effect(KILL cr);
7856   format %{ "vptest_alltrue_ge16  $dst,$src1, $src2\t! using $cr as TEMP" %}
7857   ins_encode %{
7858     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7859     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7860     __ setb(Assembler::carrySet, $dst$$Register);
7861     __ movzbl($dst$$Register, $dst$$Register);
7862   %}
7863   ins_pipe( pipe_slow );
7864 %}
7865 
7866 instruct vptest_alltrue_lt8_evex(rRegI dst, kReg src1, kReg src2, kReg kscratch, rFlagsReg cr) %{
7867   predicate(VM_Version::supports_avx512bwdq() &&
7868             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
7869             n->in(1)->bottom_type()->isa_vectmask() &&
7870             Matcher::vector_length(n->in(1)) < 8);
7871   match(Set dst (VectorTest src1 src2));
7872   effect(KILL cr, TEMP kscratch);
7873   format %{ "vptest_alltrue_lt8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
7874   ins_encode %{
7875     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7876     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7877     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7878     uint masklen = Matcher::vector_length(this, $src1);
7879     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, $kscratch$$KRegister);
7880   %}
7881   ins_pipe( pipe_slow );
7882 %}
7883 
7884 
7885 instruct vptest_alltrue_ge8_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
7886   predicate(VM_Version::supports_avx512bwdq() &&
7887             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
7888             n->in(1)->bottom_type()->isa_vectmask() &&
7889             Matcher::vector_length(n->in(1)) >= 8);
7890   match(Set dst (VectorTest src1 src2));
7891   effect(KILL cr);
7892   format %{ "vptest_alltrue_ge8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
7893   ins_encode %{
7894     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7895     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7896     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7897     uint masklen = Matcher::vector_length(this, $src1);
7898     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, knoreg);
7899   %}
7900   ins_pipe( pipe_slow );
7901 %}
7902 
7903 
7904 instruct vptest_anytrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp, rFlagsReg cr) %{
7905   predicate(!VM_Version::supports_avx512bwdq() &&
7906             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7907             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7908             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7909   match(Set dst (VectorTest src1 src2 ));
7910   effect(TEMP vtmp, KILL cr);
7911   format %{ "vptest_anytrue_lt16 $dst,$src1,$src2\t! using $vtmp, $cr as TEMP" %}
7912   ins_encode %{
7913     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7914     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7915     __ setb(Assembler::notZero, $dst$$Register);
7916     __ movzbl($dst$$Register, $dst$$Register);
7917   %}
7918   ins_pipe( pipe_slow );
7919 %}
7920 
7921 instruct vptest_anytrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7922   predicate(!VM_Version::supports_avx512bwdq() &&
7923             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7924             Matcher::vector_length_in_bytes(n->in(1)) < 64  &&
7925             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7926   match(Set dst (VectorTest src1 src2 ));
7927   effect(KILL cr);
7928   format %{ "vptest_anytrue_ge16 $dst,$src1,$src2\t! using $cr as TEMP" %}
7929   ins_encode %{
7930     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7931     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7932     __ setb(Assembler::notZero, $dst$$Register);
7933     __ movzbl($dst$$Register, $dst$$Register);
7934   %}
7935   ins_pipe( pipe_slow );
7936 %}
7937 
7938 instruct vptest_anytrue_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
7939   predicate(VM_Version::supports_avx512bwdq() &&
7940             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7941   match(Set dst (VectorTest src1 src2));
7942   effect(KILL cr);
7943   format %{ "vptest_anytrue_lt8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
7944   ins_encode %{
7945     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7946     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7947     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7948     uint  masklen = Matcher::vector_length(this, $src1);
7949     __ anytrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister);
7950   %}
7951   ins_pipe( pipe_slow );
7952 %}
7953 
7954 instruct cmpvptest_anytrue_lt16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, legVec vtmp) %{
7955   predicate(!VM_Version::supports_avx512bwdq() &&
7956             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 4 &&
7957             Matcher::vector_length_in_bytes(n->in(1)->in(1)) < 16 &&
7958             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7959   match(Set cr (CmpI (VectorTest src1 src2) zero));
7960   effect(TEMP vtmp);
7961   format %{ "cmpvptest_anytrue_lt16 $src1,$src2\t! using $vtmp as TEMP" %}
7962   ins_encode %{
7963     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7964     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7965   %}
7966   ins_pipe( pipe_slow );
7967 %}
7968 
7969 instruct cmpvptest_anytrue_ge16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero) %{
7970   predicate(!VM_Version::supports_avx512bwdq() &&
7971             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 16 &&
7972             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <  64 &&
7973             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7974   match(Set cr (CmpI (VectorTest src1 src2) zero));
7975   format %{ "cmpvptest_anytrue_ge16 $src1,$src2\t!" %}
7976   ins_encode %{
7977     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7978     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7979   %}
7980   ins_pipe( pipe_slow );
7981 %}
7982 
7983 instruct cmpvptest_anytrue_evex(rFlagsReg cr, kReg src1, kReg src2, immI_0 zero) %{
7984   predicate(VM_Version::supports_avx512bwdq() &&
7985             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7986   match(Set cr (CmpI (VectorTest src1 src2) zero));
7987   format %{ "cmpvptest_anytrue_evex $src1,$src2\t!" %}
7988   ins_encode %{
7989     uint masklen = Matcher::vector_length(this, $src1);
7990     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7991     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7992     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7993     masklen = masklen < 8 ? 8 : masklen;
7994     __ ktest(masklen, $src1$$KRegister, $src2$$KRegister);
7995   %}
7996   ins_pipe( pipe_slow );
7997 %}
7998 #endif
7999 
8000 //------------------------------------- LoadMask --------------------------------------------
8001 
8002 instruct loadMask(legVec dst, legVec src) %{
8003   predicate(n->bottom_type()->isa_vectmask() == NULL && !VM_Version::supports_avx512vlbw());
8004   match(Set dst (VectorLoadMask src));
8005   effect(TEMP dst);
8006   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
8007   ins_encode %{
8008     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8009     BasicType elem_bt = Matcher::vector_element_basic_type(this);
8010     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
8011   %}
8012   ins_pipe( pipe_slow );
8013 %}
8014 
8015 instruct loadMask64(kReg dst, vec src, vec xtmp, rRegI tmp) %{
8016   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
8017   match(Set dst (VectorLoadMask src));
8018   effect(TEMP xtmp, TEMP tmp);
8019   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp and $tmp as TEMP" %}
8020   ins_encode %{
8021     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
8022                         $tmp$$Register, true, Assembler::AVX_512bit);
8023   %}
8024   ins_pipe( pipe_slow );
8025 %}
8026 
8027 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
8028   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
8029   match(Set dst (VectorLoadMask src));
8030   effect(TEMP xtmp);
8031   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
8032   ins_encode %{
8033     int vlen_enc = vector_length_encoding(in(1));
8034     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
8035                         noreg, false, vlen_enc);
8036   %}
8037   ins_pipe( pipe_slow );
8038 %}
8039 
8040 //------------------------------------- StoreMask --------------------------------------------
8041 
8042 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
8043   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8044   match(Set dst (VectorStoreMask src size));
8045   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8046   ins_encode %{
8047     int vlen = Matcher::vector_length(this);
8048     if (vlen <= 16 && UseAVX <= 2) {
8049       assert(UseSSE >= 3, "required");
8050       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
8051     } else {
8052       assert(UseAVX > 0, "required");
8053       int src_vlen_enc = vector_length_encoding(this, $src);
8054       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8055     }
8056   %}
8057   ins_pipe( pipe_slow );
8058 %}
8059 
8060 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
8061   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8062   match(Set dst (VectorStoreMask src size));
8063   effect(TEMP_DEF dst, TEMP xtmp);
8064   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8065   ins_encode %{
8066     int vlen_enc = Assembler::AVX_128bit;
8067     int vlen = Matcher::vector_length(this);
8068     if (vlen <= 8) {
8069       assert(UseSSE >= 3, "required");
8070       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
8071       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
8072       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
8073     } else {
8074       assert(UseAVX > 0, "required");
8075       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
8076       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8077       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8078     }
8079   %}
8080   ins_pipe( pipe_slow );
8081 %}
8082 
8083 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
8084   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8085   match(Set dst (VectorStoreMask src size));
8086   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8087   effect(TEMP_DEF dst, TEMP xtmp);
8088   ins_encode %{
8089     int vlen_enc = Assembler::AVX_128bit;
8090     int vlen = Matcher::vector_length(this);
8091     if (vlen <= 4) {
8092       assert(UseSSE >= 3, "required");
8093       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
8094       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
8095       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
8096       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
8097     } else {
8098       assert(UseAVX > 0, "required");
8099       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
8100       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
8101       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8102       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
8103       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8104     }
8105   %}
8106   ins_pipe( pipe_slow );
8107 %}
8108 
8109 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
8110   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
8111   match(Set dst (VectorStoreMask src size));
8112   effect(TEMP_DEF dst, TEMP xtmp);
8113   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8114   ins_encode %{
8115     assert(UseSSE >= 3, "required");
8116     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
8117     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
8118     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
8119     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
8120     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
8121   %}
8122   ins_pipe( pipe_slow );
8123 %}
8124 
8125 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
8126   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
8127   match(Set dst (VectorStoreMask src size));
8128   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
8129   effect(TEMP_DEF dst, TEMP vtmp);
8130   ins_encode %{
8131     int vlen_enc = Assembler::AVX_128bit;
8132     __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
8133     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
8134     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
8135     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8136     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8137     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8138     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8139   %}
8140   ins_pipe( pipe_slow );
8141 %}
8142 
8143 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
8144   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8145   match(Set dst (VectorStoreMask src size));
8146   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8147   ins_encode %{
8148     int src_vlen_enc = vector_length_encoding(this, $src);
8149     int dst_vlen_enc = vector_length_encoding(this);
8150     if (!VM_Version::supports_avx512vl()) {
8151       src_vlen_enc = Assembler::AVX_512bit;
8152     }
8153     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8154     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8155   %}
8156   ins_pipe( pipe_slow );
8157 %}
8158 
8159 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
8160   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8161   match(Set dst (VectorStoreMask src size));
8162   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8163   ins_encode %{
8164     int src_vlen_enc = vector_length_encoding(this, $src);
8165     int dst_vlen_enc = vector_length_encoding(this);
8166     if (!VM_Version::supports_avx512vl()) {
8167       src_vlen_enc = Assembler::AVX_512bit;
8168     }
8169     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8170     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8171   %}
8172   ins_pipe( pipe_slow );
8173 %}
8174 
8175 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size, rRegI tmp) %{
8176   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
8177   match(Set dst (VectorStoreMask mask size));
8178   effect(TEMP_DEF dst, TEMP tmp);
8179   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
8180   ins_encode %{
8181     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
8182     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
8183                  false, Assembler::AVX_512bit, $tmp$$Register);
8184     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
8185   %}
8186   ins_pipe( pipe_slow );
8187 %}
8188 
8189 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
8190   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
8191   match(Set dst (VectorStoreMask mask size));
8192   effect(TEMP_DEF dst);
8193   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
8194   ins_encode %{
8195     int dst_vlen_enc = vector_length_encoding(this);
8196     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
8197     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8198   %}
8199   ins_pipe( pipe_slow );
8200 %}
8201 
8202 instruct vmaskcast_evex(kReg dst) %{
8203   predicate(Matcher::vector_length(n) == Matcher::vector_length(n->in(1)));
8204   match(Set dst (VectorMaskCast dst));
8205   ins_cost(0);
8206   format %{ "vector_mask_cast $dst" %}
8207   ins_encode %{
8208     // empty
8209   %}
8210   ins_pipe(empty);
8211 %}
8212 
8213 instruct vmaskcast(vec dst) %{
8214   predicate((Matcher::vector_length(n) == Matcher::vector_length(n->in(1))) &&
8215             (Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))));
8216   match(Set dst (VectorMaskCast dst));
8217   ins_cost(0);
8218   format %{ "vector_mask_cast $dst" %}
8219   ins_encode %{
8220     // empty
8221   %}
8222   ins_pipe(empty);
8223 %}
8224 
8225 //-------------------------------- Load Iota Indices ----------------------------------
8226 
8227 instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
8228   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
8229   match(Set dst (VectorLoadConst src));
8230   effect(TEMP scratch);
8231   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
8232   ins_encode %{
    int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
    __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen_in_bytes);
8235   %}
8236   ins_pipe( pipe_slow );
8237 %}
8238 
8239 //-------------------------------- Rearrange ----------------------------------
8240 
8241 // LoadShuffle/Rearrange for Byte
8242 
8243 instruct loadShuffleB(vec dst) %{
8244   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
8245   match(Set dst (VectorLoadShuffle dst));
8246   format %{ "vector_load_shuffle $dst, $dst" %}
8247   ins_encode %{
8248     // empty
8249   %}
8250   ins_pipe( pipe_slow );
8251 %}
8252 
8253 instruct rearrangeB(vec dst, vec shuffle) %{
8254   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8255             Matcher::vector_length(n) < 32);
8256   match(Set dst (VectorRearrange dst shuffle));
8257   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8258   ins_encode %{
8259     assert(UseSSE >= 4, "required");
8260     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8261   %}
8262   ins_pipe( pipe_slow );
8263 %}
8264 
8265 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
8266   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8267             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
8268   match(Set dst (VectorRearrange src shuffle));
8269   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
8270   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
8271   ins_encode %{
8272     assert(UseAVX >= 2, "required");
8273     // Swap src into vtmp1
8274     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
8275     // Shuffle swapped src to get entries from other 128 bit lane
8276     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8277     // Shuffle original src to get entries from self 128 bit lane
8278     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8279     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
8280     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
8281     // Perform the blend
8282     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
8283   %}
8284   ins_pipe( pipe_slow );
8285 %}
8286 
8287 instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
8288   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8289             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
8290   match(Set dst (VectorRearrange src shuffle));
8291   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8292   ins_encode %{
8293     int vlen_enc = vector_length_encoding(this);
8294     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8295   %}
8296   ins_pipe( pipe_slow );
8297 %}
8298 
8299 // LoadShuffle/Rearrange for Short
8300 
8301 instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
8302   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8303             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
8304   match(Set dst (VectorLoadShuffle src));
8305   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8306   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8307   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
8310     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8311     if (UseAVX == 0) {
8312       assert(vlen_in_bytes <= 16, "required");
8313       // Multiply each shuffle by two to get byte index
8314       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
8315       __ psllw($vtmp$$XMMRegister, 1);
8316 
8317       // Duplicate to create 2 copies of byte index
8318       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
8319       __ psllw($dst$$XMMRegister, 8);
8320       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
8321 
8322       // Add one to get alternate byte index
8323       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
8324       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
8325     } else {
8326       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
8327       int vlen_enc = vector_length_encoding(this);
8328       // Multiply each shuffle by two to get byte index
8329       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
8330       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
8331 
8332       // Duplicate to create 2 copies of byte index
8333       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
8334       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8335 
8336       // Add one to get alternate byte index
8337       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, $scratch$$Register);
8338     }
8339   %}
8340   ins_pipe( pipe_slow );
8341 %}
8342 
8343 instruct rearrangeS(vec dst, vec shuffle) %{
8344   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8345             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
8346   match(Set dst (VectorRearrange dst shuffle));
8347   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8348   ins_encode %{
8349     assert(UseSSE >= 4, "required");
8350     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8351   %}
8352   ins_pipe( pipe_slow );
8353 %}
8354 
8355 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
8356   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8357             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
8358   match(Set dst (VectorRearrange src shuffle));
8359   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
8360   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
8361   ins_encode %{
8362     assert(UseAVX >= 2, "required");
8363     // Swap src into vtmp1
8364     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
8365     // Shuffle swapped src to get entries from other 128 bit lane
8366     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8367     // Shuffle original src to get entries from self 128 bit lane
8368     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
8369     // Create a blend mask by setting high bits for entries coming from other lane in shuffle
8370     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
8371     // Perform the blend
8372     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
8373   %}
8374   ins_pipe( pipe_slow );
8375 %}
8376 
8377 instruct loadShuffleS_evex(vec dst, vec src) %{
8378   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8379             VM_Version::supports_avx512bw());
8380   match(Set dst (VectorLoadShuffle src));
8381   format %{ "vector_load_shuffle $dst, $src" %}
8382   ins_encode %{
8383     int vlen_enc = vector_length_encoding(this);
8384     if (!VM_Version::supports_avx512vl()) {
8385       vlen_enc = Assembler::AVX_512bit;
8386     }
8387     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8388   %}
8389   ins_pipe( pipe_slow );
8390 %}
8391 
8392 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
8393   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8394             VM_Version::supports_avx512bw());
8395   match(Set dst (VectorRearrange src shuffle));
8396   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8397   ins_encode %{
8398     int vlen_enc = vector_length_encoding(this);
8399     if (!VM_Version::supports_avx512vl()) {
8400       vlen_enc = Assembler::AVX_512bit;
8401     }
8402     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8403   %}
8404   ins_pipe( pipe_slow );
8405 %}
8406 
8407 // LoadShuffle/Rearrange for Integer and Float
8408 
8409 instruct loadShuffleI(vec dst, vec src, vec vtmp, rRegP scratch) %{
8410   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8411             Matcher::vector_length(n) == 4 && UseAVX < 2);
8412   match(Set dst (VectorLoadShuffle src));
8413   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8414   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8415   ins_encode %{
8416     assert(UseSSE >= 4, "required");
8417 
    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction is available on these platforms.
8420 
8421     // Duplicate and multiply each shuffle by 4
8422     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
8423     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
8424     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
8425     __ psllw($vtmp$$XMMRegister, 2);
8426 
8427     // Duplicate again to create 4 copies of byte index
8428     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
8429     __ psllw($dst$$XMMRegister, 8);
8430     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
8431 
8432     // Add 3,2,1,0 to get alternate byte index
8433     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
8434     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
8435   %}
8436   ins_pipe( pipe_slow );
8437 %}
8438 
8439 instruct rearrangeI(vec dst, vec shuffle) %{
8440  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8441            Matcher::vector_length(n) == 4 && UseAVX < 2);
8442   match(Set dst (VectorRearrange dst shuffle));
8443   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8444   ins_encode %{
8445     assert(UseSSE >= 4, "required");
8446     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8447   %}
8448   ins_pipe( pipe_slow );
8449 %}
8450 
8451 instruct loadShuffleI_avx(vec dst, vec src) %{
8452   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8453             UseAVX >= 2);
8454   match(Set dst (VectorLoadShuffle src));
8455   format %{ "vector_load_shuffle $dst, $src" %}
8456   ins_encode %{
    int vlen_enc = vector_length_encoding(this);
8458     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8459   %}
8460   ins_pipe( pipe_slow );
8461 %}
8462 
8463 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
8464   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8465             UseAVX >= 2);
8466   match(Set dst (VectorRearrange src shuffle));
8467   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8468   ins_encode %{
8469     int vlen_enc = vector_length_encoding(this);
8470     if (vlen_enc == Assembler::AVX_128bit) {
8471       vlen_enc = Assembler::AVX_256bit;
8472     }
8473     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8474   %}
8475   ins_pipe( pipe_slow );
8476 %}
8477 
8478 // LoadShuffle/Rearrange for Long and Double
8479 
8480 instruct loadShuffleL(vec dst, vec src, vec vtmp, rRegP scratch) %{
8481   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8482             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
8483   match(Set dst (VectorLoadShuffle src));
8484   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8485   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8486   ins_encode %{
8487     assert(UseAVX >= 2, "required");
8488 
8489     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask;
    // only a double word shuffle instruction is available on these platforms.
8492 
8493     // Multiply each shuffle by two to get double word index
8494     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
8495     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
8496 
8497     // Duplicate each double word shuffle
8498     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
8499     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8500 
8501     // Add one to get alternate double word index
8502     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, $scratch$$Register);
8503   %}
8504   ins_pipe( pipe_slow );
8505 %}
8506 
8507 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
8508   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8509             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
8510   match(Set dst (VectorRearrange src shuffle));
8511   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8512   ins_encode %{
8513     assert(UseAVX >= 2, "required");
8514 
8515     int vlen_enc = vector_length_encoding(this);
8516     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8517   %}
8518   ins_pipe( pipe_slow );
8519 %}
8520 
8521 instruct loadShuffleL_evex(vec dst, vec src) %{
8522   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8523             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
8524   match(Set dst (VectorLoadShuffle src));
8525   format %{ "vector_load_shuffle $dst, $src" %}
8526   ins_encode %{
8527     assert(UseAVX > 2, "required");
8528 
8529     int vlen_enc = vector_length_encoding(this);
8530     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8531   %}
8532   ins_pipe( pipe_slow );
8533 %}
8534 
8535 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
8536   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8537             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
8538   match(Set dst (VectorRearrange src shuffle));
8539   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8540   ins_encode %{
8541     assert(UseAVX > 2, "required");
8542 
8543     int vlen_enc = vector_length_encoding(this);
8544     if (vlen_enc == Assembler::AVX_128bit) {
8545       vlen_enc = Assembler::AVX_256bit;
8546     }
8547     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8548   %}
8549   ins_pipe( pipe_slow );
8550 %}
8551 
8552 // --------------------------------- FMA --------------------------------------
8553 // a * b + c
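// FmaVF/FmaVD map onto the x86 FMA forms, so the multiply and add are fused
// with a single rounding; the encodings assert that UseFMA is enabled.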
8554 
8555 instruct vfmaF_reg(vec a, vec b, vec c) %{
8556   match(Set c (FmaVF  c (Binary a b)));
8557   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8558   ins_cost(150);
8559   ins_encode %{
8560     assert(UseFMA, "not enabled");
8561     int vlen_enc = vector_length_encoding(this);
8562     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8563   %}
8564   ins_pipe( pipe_slow );
8565 %}
8566 
8567 instruct vfmaF_mem(vec a, memory b, vec c) %{
8568   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8569   match(Set c (FmaVF  c (Binary a (LoadVector b))));
8570   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8571   ins_cost(150);
8572   ins_encode %{
8573     assert(UseFMA, "not enabled");
8574     int vlen_enc = vector_length_encoding(this);
8575     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8576   %}
8577   ins_pipe( pipe_slow );
8578 %}
8579 
8580 instruct vfmaD_reg(vec a, vec b, vec c) %{
8581   match(Set c (FmaVD  c (Binary a b)));
8582   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8583   ins_cost(150);
8584   ins_encode %{
8585     assert(UseFMA, "not enabled");
8586     int vlen_enc = vector_length_encoding(this);
8587     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8588   %}
8589   ins_pipe( pipe_slow );
8590 %}
8591 
8592 instruct vfmaD_mem(vec a, memory b, vec c) %{
8593   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8594   match(Set c (FmaVD  c (Binary a (LoadVector b))));
8595   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8596   ins_cost(150);
8597   ins_encode %{
8598     assert(UseFMA, "not enabled");
8599     int vlen_enc = vector_length_encoding(this);
8600     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8601   %}
8602   ins_pipe( pipe_slow );
8603 %}
8604 
8605 // --------------------------------- Vector Multiply Add --------------------------------------
8606 
8607 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
8608   predicate(UseAVX == 0);
8609   match(Set dst (MulAddVS2VI dst src1));
8610   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
8611   ins_encode %{
8612     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
8613   %}
8614   ins_pipe( pipe_slow );
8615 %}
8616 
8617 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
8618   predicate(UseAVX > 0);
8619   match(Set dst (MulAddVS2VI src1 src2));
8620   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
8621   ins_encode %{
8622     int vlen_enc = vector_length_encoding(this);
8623     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8624   %}
8625   ins_pipe( pipe_slow );
8626 %}
8627 
8628 // --------------------------------- Vector Multiply Add Add ----------------------------------
8629 
8630 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
8631   predicate(VM_Version::supports_avx512_vnni());
8632   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
8633   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
8634   ins_encode %{
8635     assert(UseAVX > 2, "required");
8636     int vlen_enc = vector_length_encoding(this);
8637     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8638   %}
8639   ins_pipe( pipe_slow );
8640   ins_cost(10);
8641 %}
8642 
8643 // --------------------------------- PopCount --------------------------------------
8644 
8645 instruct vpopcountI_popcntd(vec dst, vec src) %{
8646   predicate(VM_Version::supports_avx512_vpopcntdq());
8647   match(Set dst (PopCountVI src));
8648   format %{ "vector_popcount_int $dst, $src\t! vector popcount packedI" %}
8649   ins_encode %{
8650     assert(UsePopCountInstruction, "not enabled");
8651     int vlen_enc = vector_length_encoding(this);
8652     __ vector_popcount_int($dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg, noreg, vlen_enc);
8653   %}
8654   ins_pipe( pipe_slow );
8655 %}
8656 
8657 instruct vpopcountI(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp, rFlagsReg cc) %{
8658   predicate(!VM_Version::supports_avx512_vpopcntdq());
8659   match(Set dst (PopCountVI src));
8660   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, KILL cc);
8661   format %{ "vector_popcount_int  $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
8662   ins_encode %{
8663     assert(UsePopCountInstruction, "not enabled");
8664     int vlen_enc = vector_length_encoding(this);
8665     __ vector_popcount_int($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
8666                            $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
8667   %}
8668   ins_pipe( pipe_slow );
8669 %}
8670 
8671 instruct vpopcountL_popcntd(vec dst, vec src) %{
8672   predicate(VM_Version::supports_avx512_vpopcntdq());
8673   match(Set dst (PopCountVL src));
8674   format %{ "vector_popcount_long  $dst, $src\t! vector popcount packedL" %}
8675   ins_encode %{
8676     assert(UsePopCountInstruction, "not enabled");
8677     int vlen_enc = vector_length_encoding(this, $src);
8678     __ vector_popcount_long($dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg, noreg, vlen_enc);
8679   %}
8680   ins_pipe( pipe_slow );
8681 %}
8682 
8683 instruct vpopcountL(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp, rFlagsReg cc) %{
8684   predicate(!VM_Version::supports_avx512_vpopcntdq());
8685   match(Set dst (PopCountVL src));
8686   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, KILL cc);
8687   format %{ "vector_popcount_long  $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
8688   ins_encode %{
8689     assert(UsePopCountInstruction, "not enabled");
8690     int vlen_enc = vector_length_encoding(this, $src);
8691     __ vector_popcount_long($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
8692                            $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
8693   %}
8694   ins_pipe( pipe_slow );
8695 %}
8696 
8697 // --------------------------------- Bitwise Ternary Logic ----------------------------------
8698 
8699 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
8700   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
8701   effect(TEMP dst);
8702   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8703   ins_encode %{
8704     int vector_len = vector_length_encoding(this);
8705     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
8706   %}
8707   ins_pipe( pipe_slow );
8708 %}
8709 
8710 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
8711   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
8712   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
8713   effect(TEMP dst);
8714   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8715   ins_encode %{
8716     int vector_len = vector_length_encoding(this);
8717     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
8718   %}
8719   ins_pipe( pipe_slow );
8720 %}
8721 
8722 // --------------------------------- Rotation Operations ----------------------------------
8723 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
8724   match(Set dst (RotateLeftV src shift));
8725   match(Set dst (RotateRightV src shift));
8726   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
8727   ins_encode %{
8728     int opcode      = this->ideal_Opcode();
8729     int vector_len  = vector_length_encoding(this);
8730     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8731     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
8732   %}
8733   ins_pipe( pipe_slow );
8734 %}
8735 
8736 instruct vprorate(vec dst, vec src, vec shift) %{
8737   match(Set dst (RotateLeftV src shift));
8738   match(Set dst (RotateRightV src shift));
8739   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
8740   ins_encode %{
8741     int opcode      = this->ideal_Opcode();
8742     int vector_len  = vector_length_encoding(this);
8743     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8744     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8745   %}
8746   ins_pipe( pipe_slow );
8747 %}
8748 
8749 #ifdef _LP64
8750 // ---------------------------------- Masked Operations ------------------------------------
8751 
8752 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
8753   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
8754   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
8755   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
8756   ins_encode %{
8757     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
8758     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
8759 
8760     Label DONE;
8761     int vlen_enc = vector_length_encoding(this, $src1);
8762     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
8763 
8764     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
8765     __ mov64($dst$$Register, -1L);
8766     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
8767     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
8768     __ jccb(Assembler::carrySet, DONE);
8769     __ kmovql($dst$$Register, $ktmp1$$KRegister);
8770     __ notq($dst$$Register);
8771     __ tzcntq($dst$$Register, $dst$$Register);
8772     __ bind(DONE);
8773   %}
8774   ins_pipe( pipe_slow );
8775 %}
8776 
8777 
8778 instruct vmasked_load64(vec dst, memory mem, kReg mask) %{
8779   match(Set dst (LoadVectorMasked mem mask));
8780   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
8781   ins_encode %{
8782     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
8783     int vector_len = vector_length_encoding(this);
8784     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
8785   %}
8786   ins_pipe( pipe_slow );
8787 %}
8788 
8789 instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
8790   match(Set dst (VectorMaskGen len));
8791   effect(TEMP temp);
8792   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
8793   ins_encode %{
8794     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
8795   %}
8796   ins_pipe( pipe_slow );
8797 %}
8798 
8799 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
8800   match(Set dst (VectorMaskGen len));
8801   format %{ "vector_mask_gen $len \t! vector mask generator" %}
8802   effect(TEMP temp);
8803   ins_encode %{
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
8805     __ kmovql($dst$$KRegister, $temp$$Register);
8806   %}
8807   ins_pipe( pipe_slow );
8808 %}
8809 
8810 instruct vmasked_store64(memory mem, vec src, kReg mask) %{
8811   match(Set mem (StoreVectorMasked mem (Binary src mask)));
8812   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
8813   ins_encode %{
8814     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
8815     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
8816     int vector_len = vector_length_encoding(src_node);
8817     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, vector_len);
8818   %}
8819   ins_pipe( pipe_slow );
8820 %}
8821 
8822 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
8823   predicate(n->in(1)->bottom_type()->isa_vectmask());
8824   match(Set dst (VectorMaskToLong mask));
8825   effect(TEMP dst, KILL cr);
8826   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
8827   ins_encode %{
8828     int opcode = this->ideal_Opcode();
8829     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8830     int mask_len = Matcher::vector_length(this, $mask);
8831     int mask_size = mask_len * type2aelembytes(mbt);
8832     int vlen_enc = vector_length_encoding(this, $mask);
8833     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
8834                              $dst$$Register, mask_len, mask_size, vlen_enc);
8835   %}
8836   ins_pipe( pipe_slow );
8837 %}
8838 
8839 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
8840   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8841   match(Set dst (VectorMaskToLong mask));
8842   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
8843   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
8844   ins_encode %{
8845     int opcode = this->ideal_Opcode();
8846     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8847     int mask_len = Matcher::vector_length(this, $mask);
8848     int vlen_enc = vector_length_encoding(this, $mask);
8849     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8850                              $dst$$Register, mask_len, mbt, vlen_enc);
8851   %}
8852   ins_pipe( pipe_slow );
8853 %}
8854 
8855 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
8856   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
8857   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
8858   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
8859   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
8860   ins_encode %{
8861     int opcode = this->ideal_Opcode();
8862     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8863     int mask_len = Matcher::vector_length(this, $mask);
8864     int vlen_enc = vector_length_encoding(this, $mask);
8865     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8866                              $dst$$Register, mask_len, mbt, vlen_enc);
8867   %}
8868   ins_pipe( pipe_slow );
8869 %}
8870 
8871 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
8872   predicate(n->in(1)->bottom_type()->isa_vectmask());
8873   match(Set dst (VectorMaskTrueCount mask));
8874   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
8875   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
8876   ins_encode %{
8877     int opcode = this->ideal_Opcode();
8878     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8879     int mask_len = Matcher::vector_length(this, $mask);
8880     int mask_size = mask_len * type2aelembytes(mbt);
8881     int vlen_enc = vector_length_encoding(this, $mask);
8882     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
8883                              $tmp$$Register, mask_len, mask_size, vlen_enc);
8884   %}
8885   ins_pipe( pipe_slow );
8886 %}
8887 
8888 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8889   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8890   match(Set dst (VectorMaskTrueCount mask));
8891   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8892   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8893   ins_encode %{
8894     int opcode = this->ideal_Opcode();
8895     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8896     int mask_len = Matcher::vector_length(this, $mask);
8897     int vlen_enc = vector_length_encoding(this, $mask);
8898     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8899                              $tmp$$Register, mask_len, mbt, vlen_enc);
8900   %}
8901   ins_pipe( pipe_slow );
8902 %}
8903 
8904 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8905   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
8906   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
8907   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8908   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8909   ins_encode %{
8910     int opcode = this->ideal_Opcode();
8911     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8912     int mask_len = Matcher::vector_length(this, $mask);
8913     int vlen_enc = vector_length_encoding(this, $mask);
8914     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8915                              $tmp$$Register, mask_len, mbt, vlen_enc);
8916   %}
8917   ins_pipe( pipe_slow );
8918 %}
8919 
8920 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
8921   predicate(n->in(1)->bottom_type()->isa_vectmask());
8922   match(Set dst (VectorMaskFirstTrue mask));
8923   match(Set dst (VectorMaskLastTrue mask));
8924   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
8925   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
8926   ins_encode %{
8927     int opcode = this->ideal_Opcode();
8928     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8929     int mask_len = Matcher::vector_length(this, $mask);
8930     int mask_size = mask_len * type2aelembytes(mbt);
8931     int vlen_enc = vector_length_encoding(this, $mask);
8932     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
8933                              $tmp$$Register, mask_len, mask_size, vlen_enc);
8934   %}
8935   ins_pipe( pipe_slow );
8936 %}
8937 
8938 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8939   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8940   match(Set dst (VectorMaskFirstTrue mask));
8941   match(Set dst (VectorMaskLastTrue mask));
8942   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8943   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8944   ins_encode %{
8945     int opcode = this->ideal_Opcode();
8946     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8947     int mask_len = Matcher::vector_length(this, $mask);
8948     int vlen_enc = vector_length_encoding(this, $mask);
8949     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8950                              $tmp$$Register, mask_len, mbt, vlen_enc);
8951   %}
8952   ins_pipe( pipe_slow );
8953 %}
8954 
8955 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8956   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
8957   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
8958   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
8959   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8960   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8961   ins_encode %{
8962     int opcode = this->ideal_Opcode();
8963     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8964     int mask_len = Matcher::vector_length(this, $mask);
8965     int vlen_enc = vector_length_encoding(this, $mask);
8966     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8967                              $tmp$$Register, mask_len, mbt, vlen_enc);
8968   %}
8969   ins_pipe( pipe_slow );
8970 %}
8971 #endif // _LP64
8972 
8973 // ---------------------------------- Vector Masked Operations ------------------------------------
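     // The masked instructs below share a common shape: the destination vector
     // doubles as the first source, the kReg operand supplies the AVX-512 write
     // mask, and the encoding defers to the evmasked_op() macro-assembler helper,
     // which picks the concrete EVEX instruction from the node's ideal opcode and
     // element type. The trailing 'true' requests merge masking, so lanes whose
     // mask bit is clear keep their previous destination value. Most operations
     // come in a register-register form and a register-memory form that folds a
     // LoadVector into the instruction.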
8974 
8975 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
8976   match(Set dst (AddVB (Binary dst src2) mask));
8977   match(Set dst (AddVS (Binary dst src2) mask));
8978   match(Set dst (AddVI (Binary dst src2) mask));
8979   match(Set dst (AddVL (Binary dst src2) mask));
8980   match(Set dst (AddVF (Binary dst src2) mask));
8981   match(Set dst (AddVD (Binary dst src2) mask));
8982   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
8983   ins_encode %{
8984     int vlen_enc = vector_length_encoding(this);
8985     BasicType bt = Matcher::vector_element_basic_type(this);
8986     int opc = this->ideal_Opcode();
8987     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8988                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8989   %}
8990   ins_pipe( pipe_slow );
8991 %}
8992 
8993 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
8994   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
8995   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
8996   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
8997   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
8998   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
8999   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
9000   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
9001   ins_encode %{
9002     int vlen_enc = vector_length_encoding(this);
9003     BasicType bt = Matcher::vector_element_basic_type(this);
9004     int opc = this->ideal_Opcode();
9005     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9006                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9007   %}
9008   ins_pipe( pipe_slow );
9009 %}
9010 
9011 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
9012   match(Set dst (XorV (Binary dst src2) mask));
9013   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
9014   ins_encode %{
9015     int vlen_enc = vector_length_encoding(this);
9016     BasicType bt = Matcher::vector_element_basic_type(this);
9017     int opc = this->ideal_Opcode();
9018     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9019                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9020   %}
9021   ins_pipe( pipe_slow );
9022 %}
9023 
9024 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
9025   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
9026   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
9027   ins_encode %{
9028     int vlen_enc = vector_length_encoding(this);
9029     BasicType bt = Matcher::vector_element_basic_type(this);
9030     int opc = this->ideal_Opcode();
9031     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9032                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9033   %}
9034   ins_pipe( pipe_slow );
9035 %}
9036 
9037 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
9038   match(Set dst (OrV (Binary dst src2) mask));
9039   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
9040   ins_encode %{
9041     int vlen_enc = vector_length_encoding(this);
9042     BasicType bt = Matcher::vector_element_basic_type(this);
9043     int opc = this->ideal_Opcode();
9044     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9045                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9046   %}
9047   ins_pipe( pipe_slow );
9048 %}
9049 
9050 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
9051   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
9052   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
9053   ins_encode %{
9054     int vlen_enc = vector_length_encoding(this);
9055     BasicType bt = Matcher::vector_element_basic_type(this);
9056     int opc = this->ideal_Opcode();
9057     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9058                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9059   %}
9060   ins_pipe( pipe_slow );
9061 %}
9062 
9063 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
9064   match(Set dst (AndV (Binary dst src2) mask));
9065   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
9066   ins_encode %{
9067     int vlen_enc = vector_length_encoding(this);
9068     BasicType bt = Matcher::vector_element_basic_type(this);
9069     int opc = this->ideal_Opcode();
9070     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9071                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9072   %}
9073   ins_pipe( pipe_slow );
9074 %}
9075 
9076 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
9077   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
9078   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
9079   ins_encode %{
9080     int vlen_enc = vector_length_encoding(this);
9081     BasicType bt = Matcher::vector_element_basic_type(this);
9082     int opc = this->ideal_Opcode();
9083     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9084                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9085   %}
9086   ins_pipe( pipe_slow );
9087 %}
9088 
9089 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
9090   match(Set dst (SubVB (Binary dst src2) mask));
9091   match(Set dst (SubVS (Binary dst src2) mask));
9092   match(Set dst (SubVI (Binary dst src2) mask));
9093   match(Set dst (SubVL (Binary dst src2) mask));
9094   match(Set dst (SubVF (Binary dst src2) mask));
9095   match(Set dst (SubVD (Binary dst src2) mask));
9096   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
9097   ins_encode %{
9098     int vlen_enc = vector_length_encoding(this);
9099     BasicType bt = Matcher::vector_element_basic_type(this);
9100     int opc = this->ideal_Opcode();
9101     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9102                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9103   %}
9104   ins_pipe( pipe_slow );
9105 %}
9106 
9107 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
9108   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
9109   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
9110   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
9111   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
9112   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
9113   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
9114   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
9115   ins_encode %{
9116     int vlen_enc = vector_length_encoding(this);
9117     BasicType bt = Matcher::vector_element_basic_type(this);
9118     int opc = this->ideal_Opcode();
9119     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9120                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9121   %}
9122   ins_pipe( pipe_slow );
9123 %}
9124 
9125 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
9126   match(Set dst (MulVS (Binary dst src2) mask));
9127   match(Set dst (MulVI (Binary dst src2) mask));
9128   match(Set dst (MulVL (Binary dst src2) mask));
9129   match(Set dst (MulVF (Binary dst src2) mask));
9130   match(Set dst (MulVD (Binary dst src2) mask));
9131   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
9132   ins_encode %{
9133     int vlen_enc = vector_length_encoding(this);
9134     BasicType bt = Matcher::vector_element_basic_type(this);
9135     int opc = this->ideal_Opcode();
9136     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9137                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9138   %}
9139   ins_pipe( pipe_slow );
9140 %}
9141 
9142 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
9143   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
9144   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
9145   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
9146   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
9147   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
9148   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
9149   ins_encode %{
9150     int vlen_enc = vector_length_encoding(this);
9151     BasicType bt = Matcher::vector_element_basic_type(this);
9152     int opc = this->ideal_Opcode();
9153     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9154                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9155   %}
9156   ins_pipe( pipe_slow );
9157 %}
9158 
9159 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
9160   match(Set dst (SqrtVF dst mask));
9161   match(Set dst (SqrtVD dst mask));
9162   ins_cost(100);
9163   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
9164   ins_encode %{
9165     int vlen_enc = vector_length_encoding(this);
9166     BasicType bt = Matcher::vector_element_basic_type(this);
9167     int opc = this->ideal_Opcode();
9168     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9169                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
9170   %}
9171   ins_pipe( pipe_slow );
9172 %}
9173 
9174 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
9175   match(Set dst (DivVF (Binary dst src2) mask));
9176   match(Set dst (DivVD (Binary dst src2) mask));
9177   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
9178   ins_encode %{
9179     int vlen_enc = vector_length_encoding(this);
9180     BasicType bt = Matcher::vector_element_basic_type(this);
9181     int opc = this->ideal_Opcode();
9182     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9183                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9184   %}
9185   ins_pipe( pipe_slow );
9186 %}
9187 
9188 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
9189   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
9190   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
9191   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
9192   ins_encode %{
9193     int vlen_enc = vector_length_encoding(this);
9194     BasicType bt = Matcher::vector_element_basic_type(this);
9195     int opc = this->ideal_Opcode();
9196     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9197                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9198   %}
9199   ins_pipe( pipe_slow );
9200 %}
9201 
9202 
9203 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
9204   match(Set dst (RotateLeftV (Binary dst shift) mask));
9205   match(Set dst (RotateRightV (Binary dst shift) mask));
9206   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
9207   ins_encode %{
9208     int vlen_enc = vector_length_encoding(this);
9209     BasicType bt = Matcher::vector_element_basic_type(this);
9210     int opc = this->ideal_Opcode();
9211     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9212                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9213   %}
9214   ins_pipe( pipe_slow );
9215 %}
9216 
9217 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
9218   match(Set dst (RotateLeftV (Binary dst src2) mask));
9219   match(Set dst (RotateRightV (Binary dst src2) mask));
9220   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
9221   ins_encode %{
9222     int vlen_enc = vector_length_encoding(this);
9223     BasicType bt = Matcher::vector_element_basic_type(this);
9224     int opc = this->ideal_Opcode();
9225     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9226                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9227   %}
9228   ins_pipe( pipe_slow );
9229 %}
9230 
9231 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9232   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
9233   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
9234   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
9235   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
9236   ins_encode %{
9237     int vlen_enc = vector_length_encoding(this);
9238     BasicType bt = Matcher::vector_element_basic_type(this);
9239     int opc = this->ideal_Opcode();
9240     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9241                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9242   %}
9243   ins_pipe( pipe_slow );
9244 %}
9245 
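     // Masked shifts by a non-constant count come in two flavors, selected by
     // is_var_shift(): a uniform form where every lane is shifted by the same
     // count in $src2, and a variable form where $src2 supplies a per-lane count.
     // The extra trailing boolean passed to evmasked_op() (false vs. true below)
     // tells the helper which of the two encoding families to emit.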
9246 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
9247   predicate(!n->as_ShiftV()->is_var_shift());
9248   match(Set dst (LShiftVS (Binary dst src2) mask));
9249   match(Set dst (LShiftVI (Binary dst src2) mask));
9250   match(Set dst (LShiftVL (Binary dst src2) mask));
9251   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9252   ins_encode %{
9253     int vlen_enc = vector_length_encoding(this);
9254     BasicType bt = Matcher::vector_element_basic_type(this);
9255     int opc = this->ideal_Opcode();
9256     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9257                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9258   %}
9259   ins_pipe( pipe_slow );
9260 %}
9261 
9262 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9263   predicate(n->as_ShiftV()->is_var_shift());
9264   match(Set dst (LShiftVS (Binary dst src2) mask));
9265   match(Set dst (LShiftVI (Binary dst src2) mask));
9266   match(Set dst (LShiftVL (Binary dst src2) mask));
9267   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9268   ins_encode %{
9269     int vlen_enc = vector_length_encoding(this);
9270     BasicType bt = Matcher::vector_element_basic_type(this);
9271     int opc = this->ideal_Opcode();
9272     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9273                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9274   %}
9275   ins_pipe( pipe_slow );
9276 %}
9277 
9278 instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
9279   match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
9280   match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
9281   match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
9282   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9283   ins_encode %{
9284     int vlen_enc = vector_length_encoding(this);
9285     BasicType bt = Matcher::vector_element_basic_type(this);
9286     int opc = this->ideal_Opcode();
9287     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9288                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9289   %}
9290   ins_pipe( pipe_slow );
9291 %}
9292 
9293 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9294   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
9295   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
9296   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
9297   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
9298   ins_encode %{
9299     int vlen_enc = vector_length_encoding(this);
9300     BasicType bt = Matcher::vector_element_basic_type(this);
9301     int opc = this->ideal_Opcode();
9302     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9303                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9304   %}
9305   ins_pipe( pipe_slow );
9306 %}
9307 
9308 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
9309   predicate(!n->as_ShiftV()->is_var_shift());
9310   match(Set dst (RShiftVS (Binary dst src2) mask));
9311   match(Set dst (RShiftVI (Binary dst src2) mask));
9312   match(Set dst (RShiftVL (Binary dst src2) mask));
9313   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9314   ins_encode %{
9315     int vlen_enc = vector_length_encoding(this);
9316     BasicType bt = Matcher::vector_element_basic_type(this);
9317     int opc = this->ideal_Opcode();
9318     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9319                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9320   %}
9321   ins_pipe( pipe_slow );
9322 %}
9323 
9324 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9325   predicate(n->as_ShiftV()->is_var_shift());
9326   match(Set dst (RShiftVS (Binary dst src2) mask));
9327   match(Set dst (RShiftVI (Binary dst src2) mask));
9328   match(Set dst (RShiftVL (Binary dst src2) mask));
9329   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9330   ins_encode %{
9331     int vlen_enc = vector_length_encoding(this);
9332     BasicType bt = Matcher::vector_element_basic_type(this);
9333     int opc = this->ideal_Opcode();
9334     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9335                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9336   %}
9337   ins_pipe( pipe_slow );
9338 %}
9339 
9340 instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
9341   match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
9342   match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
9343   match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
9344   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9345   ins_encode %{
9346     int vlen_enc = vector_length_encoding(this);
9347     BasicType bt = Matcher::vector_element_basic_type(this);
9348     int opc = this->ideal_Opcode();
9349     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9350                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9351   %}
9352   ins_pipe( pipe_slow );
9353 %}
9354 
9355 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9356   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
9357   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
9358   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
9359   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
9360   ins_encode %{
9361     int vlen_enc = vector_length_encoding(this);
9362     BasicType bt = Matcher::vector_element_basic_type(this);
9363     int opc = this->ideal_Opcode();
9364     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9365                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9366   %}
9367   ins_pipe( pipe_slow );
9368 %}
9369 
9370 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
9371   predicate(!n->as_ShiftV()->is_var_shift());
9372   match(Set dst (URShiftVS (Binary dst src2) mask));
9373   match(Set dst (URShiftVI (Binary dst src2) mask));
9374   match(Set dst (URShiftVL (Binary dst src2) mask));
9375   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
9376   ins_encode %{
9377     int vlen_enc = vector_length_encoding(this);
9378     BasicType bt = Matcher::vector_element_basic_type(this);
9379     int opc = this->ideal_Opcode();
9380     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9381                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9382   %}
9383   ins_pipe( pipe_slow );
9384 %}
9385 
9386 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9387   predicate(n->as_ShiftV()->is_var_shift());
9388   match(Set dst (URShiftVS (Binary dst src2) mask));
9389   match(Set dst (URShiftVI (Binary dst src2) mask));
9390   match(Set dst (URShiftVL (Binary dst src2) mask));
9391   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
9392   ins_encode %{
9393     int vlen_enc = vector_length_encoding(this);
9394     BasicType bt = Matcher::vector_element_basic_type(this);
9395     int opc = this->ideal_Opcode();
9396     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9397                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9398   %}
9399   ins_pipe( pipe_slow );
9400 %}
9401 
9402 instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
9403   match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
9404   match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
9405   match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
9406   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
9407   ins_encode %{
9408     int vlen_enc = vector_length_encoding(this);
9409     BasicType bt = Matcher::vector_element_basic_type(this);
9410     int opc = this->ideal_Opcode();
9411     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9412                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9413   %}
9414   ins_pipe( pipe_slow );
9415 %}
9416 
9417 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
9418   match(Set dst (MaxV (Binary dst src2) mask));
9419   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
9420   ins_encode %{
9421     int vlen_enc = vector_length_encoding(this);
9422     BasicType bt = Matcher::vector_element_basic_type(this);
9423     int opc = this->ideal_Opcode();
9424     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9425                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9426   %}
9427   ins_pipe( pipe_slow );
9428 %}
9429 
9430 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
9431   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
9432   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
9433   ins_encode %{
9434     int vlen_enc = vector_length_encoding(this);
9435     BasicType bt = Matcher::vector_element_basic_type(this);
9436     int opc = this->ideal_Opcode();
9437     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9438                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9439   %}
9440   ins_pipe( pipe_slow );
9441 %}
9442 
9443 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
9444   match(Set dst (MinV (Binary dst src2) mask));
9445   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
9446   ins_encode %{
9447     int vlen_enc = vector_length_encoding(this);
9448     BasicType bt = Matcher::vector_element_basic_type(this);
9449     int opc = this->ideal_Opcode();
9450     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9451                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9452   %}
9453   ins_pipe( pipe_slow );
9454 %}
9455 
9456 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
9457   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
9458   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
9459   ins_encode %{
9460     int vlen_enc = vector_length_encoding(this);
9461     BasicType bt = Matcher::vector_element_basic_type(this);
9462     int opc = this->ideal_Opcode();
9463     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9464                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9465   %}
9466   ins_pipe( pipe_slow );
9467 %}
9468 
9469 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
9470   match(Set dst (VectorRearrange (Binary dst src2) mask));
9471   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
9472   ins_encode %{
9473     int vlen_enc = vector_length_encoding(this);
9474     BasicType bt = Matcher::vector_element_basic_type(this);
9475     int opc = this->ideal_Opcode();
9476     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9477                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
9478   %}
9479   ins_pipe( pipe_slow );
9480 %}
9481 
9482 instruct vabs_masked(vec dst, kReg mask) %{
9483   match(Set dst (AbsVB dst mask));
9484   match(Set dst (AbsVS dst mask));
9485   match(Set dst (AbsVI dst mask));
9486   match(Set dst (AbsVL dst mask));
9487   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
9488   ins_cost(100);
9489   ins_encode %{
9490     int vlen_enc = vector_length_encoding(this);
9491     BasicType bt = Matcher::vector_element_basic_type(this);
9492     int opc = this->ideal_Opcode();
9493     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9494                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
9495   %}
9496   ins_pipe( pipe_slow );
9497 %}
9498 
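     // Masked fused multiply-add: $dst is both an input operand and the merge
     // destination, $src2 and $src3 supply the remaining operands, and the memory
     // form folds the load of $src3 into the instruction.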
9499 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
9500   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
9501   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
9502   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
9503   ins_encode %{
9504     int vlen_enc = vector_length_encoding(this);
9505     BasicType bt = Matcher::vector_element_basic_type(this);
9506     int opc = this->ideal_Opcode();
9507     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9508                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
9509   %}
9510   ins_pipe( pipe_slow );
9511 %}
9512 
9513 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
9514   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
9515   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
9516   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
9517   ins_encode %{
9518     int vlen_enc = vector_length_encoding(this);
9519     BasicType bt = Matcher::vector_element_basic_type(this);
9520     int opc = this->ideal_Opcode();
9521     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9522                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
9523   %}
9524   ins_pipe( pipe_slow );
9525 %}
9526 
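     // Masked vector compare: the incoming $mask is applied as the write mask of
     // the EVEX compare, so only enabled lanes can set a bit in the $dst predicate
     // register. The element type of $src1 selects evpcmpb/w/d/q for integral
     // types (honoring signed vs. unsigned predicates) and evcmpps/evcmppd for
     // floating point.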
9527 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask, rRegP scratch) %{
9528   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
9529   effect(TEMP scratch);
9530   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask\t! using $scratch as TEMP" %}
9531   ins_encode %{
9532     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
9533     int vlen_enc = vector_length_encoding(this, $src1);
9534     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
9535 
9536     // Dispatch on the element type of the first source vector.
9537     switch (src1_elem_bt) {
9538       case T_BYTE: {
9539         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9540         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9541         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9542         break;
9543       }
9544       case T_SHORT: {
9545         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9546         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9547         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9548         break;
9549       }
9550       case T_INT: {
9551         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9552         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9553         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9554         break;
9555       }
9556       case T_LONG: {
9557         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9558         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9559         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9560         break;
9561       }
9562       case T_FLOAT: {
9563         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
9564         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
9565         break;
9566       }
9567       case T_DOUBLE: {
9568         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
9569         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
9570         break;
9571       }
9572       default: assert(false, "%s", type2name(src1_elem_bt)); break;
9573     }
9574   %}
9575   ins_pipe( pipe_slow );
9576 %}
9577 
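     // MaskAll replicates a scalar condition into every lane of an opmask
     // register; this variant handles mask lengths of up to 32 lanes.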
9578 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
9579   predicate(Matcher::vector_length(n) <= 32);
9580   match(Set dst (MaskAll src));
9581   format %{ "mask_all_evexI_LE32 $dst, $src \t! mask all operation" %}
9582   ins_encode %{
9583     int mask_len = Matcher::vector_length(this);
9584     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
9585   %}
9586   ins_pipe( pipe_slow );
9587 %}
9588 
9589 #ifdef _LP64
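     // Mask negation reaches the matcher as (XorVMask src (MaskAll -1)). Both
     // forms below reduce it to a k-register NOT; the sub-8-lane variant needs an
     // extra GPR and k register as temporaries for the knot() helper.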
9590 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
9591   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
9592   match(Set dst (XorVMask src (MaskAll cnt)));
9593   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
9594   format %{ "mask_not_LT8 $dst, $src, $cnt \t! using $ktmp and $rtmp as TEMP" %}
9595   ins_encode %{
9596     uint masklen = Matcher::vector_length(this);
9597     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
9598   %}
9599   ins_pipe( pipe_slow );
9600 %}
9601 
9602 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
9603   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
9604             (Matcher::vector_length(n) == 16) ||
9605             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
9606   match(Set dst (XorVMask src (MaskAll cnt)));
9607   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
9608   ins_encode %{
9609     uint masklen = Matcher::vector_length(this);
9610     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
9611   %}
9612   ins_pipe( pipe_slow );
9613 %}
9614 
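     // VectorLongToMask converts a long bit pattern into a vector mask. When the
     // mask is not a true predicate type the bits are expanded into a boolean
     // vector (separate variants for lengths up to 8 and above 8); with EVEX
     // predicate masks the conversion is a single kmov.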
9615 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
9616   predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) <= 8);
9617   match(Set dst (VectorLongToMask src));
9618   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
9619   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
9620   ins_encode %{
9621     int mask_len = Matcher::vector_length(this);
9622     int vec_enc  = vector_length_encoding(mask_len);
9623     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
9624                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
9625   %}
9626   ins_pipe( pipe_slow );
9627 %}
9628 
9629 
9630 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
9631   predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) > 8);
9632   match(Set dst (VectorLongToMask src));
9633   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
9634   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1 as TEMP" %}
9635   ins_encode %{
9636     int mask_len = Matcher::vector_length(this);
9637     assert(mask_len <= 32, "invalid mask length");
9638     int vec_enc  = vector_length_encoding(mask_len);
9639     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
9640                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
9641   %}
9642   ins_pipe( pipe_slow );
9643 %}
9644 
9645 instruct long_to_mask_evex(kReg dst, rRegL src) %{
9646   predicate(n->bottom_type()->isa_vectmask());
9647   match(Set dst (VectorLongToMask src));
9648   format %{ "long_to_mask_evex $dst, $src\t! long to mask" %}
9649   ins_encode %{
9650     __ kmov($dst$$KRegister, $src$$Register);
9651   %}
9652   ins_pipe( pipe_slow );
9653 %}
9654 #endif
9655 
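     // Logical operations on whole masks (AND/OR/XOR of k registers). When the
     // mask is shorter than 16 bits and AVX512DQ is unavailable there is no
     // byte-granular k instruction, so the operation is widened to 16 bits.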
9656 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
9657   match(Set dst (AndVMask src1 src2));
9658   match(Set dst (OrVMask src1 src2));
9659   match(Set dst (XorVMask src1 src2));
9660   effect(TEMP kscratch);
9661   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
9662   ins_encode %{
9663     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
9664     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
9665     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
9666     uint masklen = Matcher::vector_length(this);
9667     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
9668     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
9669   %}
9670   ins_pipe( pipe_slow );
9671 %}
9672 
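     // Masked ternary logic: vpternlog combines $dst, $src2 and $src3 according
     // to the 8-bit truth table in $func, updating only the lanes enabled by $mask.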
9673 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
9674   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
9675   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
9676   ins_encode %{
9677     int vlen_enc = vector_length_encoding(this);
9678     BasicType bt = Matcher::vector_element_basic_type(this);
9679     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
9680                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
9681   %}
9682   ins_pipe( pipe_slow );
9683 %}
9684 
9685 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
9686   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
9687   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
9688   ins_encode %{
9689     int vlen_enc = vector_length_encoding(this);
9690     BasicType bt = Matcher::vector_element_basic_type(this);
9691     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
9692                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
9693   %}
9694   ins_pipe( pipe_slow );
9695 %}
9696 
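     // CastVV is a type-only cast: all three register classes (opmask, vector and
     // legacy vector) emit no code.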
9697 instruct castMM(kReg dst)
9698 %{
9699   match(Set dst (CastVV dst));
9700 
9701   size(0);
9702   format %{ "# castVV of $dst" %}
9703   ins_encode(/* empty encoding */);
9704   ins_cost(0);
9705   ins_pipe(empty);
9706 %}
9707 
9708 instruct castVV(vec dst)
9709 %{
9710   match(Set dst (CastVV dst));
9711 
9712   size(0);
9713   format %{ "# castVV of $dst" %}
9714   ins_encode(/* empty encoding */);
9715   ins_cost(0);
9716   ins_pipe(empty);
9717 %}
9718 
9719 instruct castVVLeg(legVec dst)
9720 %{
9721   match(Set dst (CastVV dst));
9722 
9723   size(0);
9724   format %{ "# castVV of $dst" %}
9725   ins_encode(/* empty encoding */);
9726   ins_cost(0);
9727   ins_pipe(empty);
9728 %}