//
// Copyright (c) 2011, 2022, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

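// Illustrative example only (not a live definition): a hypothetical
// general-purpose register GPR_EX with encoding 0 would be declared as
//
//   reg_def GPR_EX(SOC, SOE, Op_RegI, 0, gpr_ex->as_VMReg());
//
// i.e. save-on-call for the register allocator, save-on-entry under the C
// calling convention, spilled/reloaded as an int (LoadI/StoreI), encoded as
// bit-pattern 0, and backed by the concrete VMReg gpr_ex. The XMM
// definitions below follow the same pattern, one reg_def per 32-bit slot.
//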
// XMM registers.  512-bit registers, i.e. 16 32-bit words each, labeled (a)-(p).
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with a REX prefix (VEX when UseAVX is enabled)
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with an extended prefix (EVEX when UseAVX is enabled).
//
// Linux ABI:   No registers are preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM31 preserved across function calls
//              XMM0-XMM3 might hold parameters

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


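// Note (descriptive, not normative): alloc_class groups registers into
// allocation chunks for the register allocator; the order in which registers
// are listed within a chunk is the order in which they are considered for
// allocation.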
alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

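// Note (descriptive, not normative): reg_class_dynamic selects between two
// statically defined register classes at runtime -- as used below, the first
// class is chosen when the %{ ... %} predicate evaluates to true (EVEX
// capable CPU), otherwise the second (legacy) class is used.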
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for all 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 128bit vector registers
reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d
#endif
                      );

// Class for all 128bit vector registers
reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,
                      XMM3,  XMM3b,  XMM3c,  XMM3d,
                      XMM4,  XMM4b,  XMM4c,  XMM4d,
                      XMM5,  XMM5b,  XMM5c,  XMM5d,
                      XMM6,  XMM6b,  XMM6c,  XMM6d,
                      XMM7,  XMM7b,  XMM7c,  XMM7d
#ifdef _LP64
                     ,XMM8,  XMM8b,  XMM8c,  XMM8d,
                      XMM9,  XMM9b,  XMM9c,  XMM9d,
                      XMM10, XMM10b, XMM10c, XMM10d,
                      XMM11, XMM11b, XMM11c, XMM11d,
                      XMM12, XMM12b, XMM12c, XMM12d,
                      XMM13, XMM13b, XMM13c, XMM13d,
                      XMM14, XMM14b, XMM14c, XMM14d,
                      XMM15, XMM15b, XMM15c, XMM15d,
                      XMM16, XMM16b, XMM16c, XMM16d,
                      XMM17, XMM17b, XMM17c, XMM17d,
                      XMM18, XMM18b, XMM18c, XMM18d,
                      XMM19, XMM19b, XMM19c, XMM19d,
                      XMM20, XMM20b, XMM20c, XMM20d,
                      XMM21, XMM21b, XMM21c, XMM21d,
                      XMM22, XMM22b, XMM22c, XMM22d,
                      XMM23, XMM23b, XMM23c, XMM23d,
                      XMM24, XMM24b, XMM24c, XMM24d,
                      XMM25, XMM25b, XMM25c, XMM25d,
                      XMM26, XMM26b, XMM26c, XMM26d,
                      XMM27, XMM27b, XMM27c, XMM27d,
                      XMM28, XMM28b, XMM28c, XMM28d,
                      XMM29, XMM29b, XMM29c, XMM29d,
                      XMM30, XMM30b, XMM30c, XMM30d,
                      XMM31, XMM31b, XMM31c, XMM31d
#endif
                      );

reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 256bit vector registers
reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
                      XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
                      XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1035 #ifdef _LP64
1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
1044 #endif
1045                       );
1046 
1047 // Class for all 256bit vector registers
1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1056 #ifdef _LP64
1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1081 #endif
1082                       );
1083 
1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1086 
1087 // Class for all 512bit vector registers
1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1096 #ifdef _LP64
1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1121 #endif
1122                       );
1123 
1124 // Class for restricted 512bit vector registers
1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1133 #ifdef _LP64
1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1142 #endif
1143                       );
1144 
1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1147 
1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1149 %}
1150 
1151 
1152 //----------SOURCE BLOCK-------------------------------------------------------
1153 // This is a block of C++ code which provides values, functions, and
1154 // definitions necessary in the rest of the architecture description
1155 
1156 source_hpp %{
1157 // Header information of the source block.
1158 // Method declarations/definitions which are used outside
1159 // the ad-scope can conveniently be defined here.
1160 //
1161 // To keep related declarations/definitions/uses close together,
1162 // we switch between source %{ }% and source_hpp %{ }% freely as needed.
1163 
1164 #include "runtime/vm_version.hpp"
1165 
1166 class NativeJump;
1167 
1168 class CallStubImpl {
1169 
1170   //--------------------------------------------------------------
1171   //---<  Used for optimization in Compile::shorten_branches  >---
1172   //--------------------------------------------------------------
1173 
1174  public:
1175   // Size of call trampoline stub.
1176   static uint size_call_trampoline() {
1177     return 0; // no call trampolines on this platform
1178   }
1179 
1180   // number of relocations needed by a call trampoline stub
1181   static uint reloc_call_trampoline() {
1182     return 0; // no call trampolines on this platform
1183   }
1184 };
1185 
1186 class HandlerImpl {
1187 
1188  public:
1189 
1190   static int emit_exception_handler(CodeBuffer &cbuf);
1191   static int emit_deopt_handler(CodeBuffer& cbuf);
1192 
1193   static uint size_exception_handler() {
1194     // NativeCall instruction size is the same as NativeJump.
1195     // The exception handler starts out as a jump and can be patched to
1196     // a call by deoptimization.  (4932387)
1197     // Note that this value is also credited (in output.cpp) to
1198     // the size of the code section.
1199     return NativeJump::instruction_size;
1200   }
1201 
1202 #ifdef _LP64
1203   static uint size_deopt_handler() {
1204     // three 5-byte instructions plus one move for an unreachable address.
1205     return 15+3;
1206   }
1207 #else
1208   static uint size_deopt_handler() {
1209     // NativeCall instruction size is the same as NativeJump.
1210     // The exception handler starts out as a jump and can be patched to
1211     // a call by deoptimization.  (4932387)
1212     // Note that this value is also credited (in output.cpp) to
1213     // the size of the code section.
1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1215   }
1216 #endif
1217 };
1218 
1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
1220   switch(bytes) {
1221     case  4: // fall-through
1222     case  8: // fall-through
1223     case 16: return Assembler::AVX_128bit;
1224     case 32: return Assembler::AVX_256bit;
1225     case 64: return Assembler::AVX_512bit;
1226 
1227     default: {
1228       ShouldNotReachHere();
1229       return Assembler::AVX_NoVec;
1230     }
1231   }
1232 }
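     // Example: vector_length_encoding(16) returns Assembler::AVX_128bit; 4- and
     // 8-byte (sub-128-bit) vector sizes are also encoded as AVX_128bit.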
1233 
1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
1236 }
1237 
1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1239   uint def_idx = use->operand_index(opnd);
1240   Node* def = use->in(def_idx);
1241   return vector_length_encoding(def);
1242 }
1243 
1244 static inline bool is_unsigned_booltest_pred(int bt) {
1245   return ((bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare);
1246 }
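     // For example, BoolTest::ult and BoolTest::ugt carry the unsigned_compare bit
     // and return true here, while BoolTest::lt and BoolTest::gt do not.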
1247 
1248 class Node::PD {
1249 public:
1250   enum NodeFlags {
1251     Flag_intel_jcc_erratum = Node::_last_flag << 1,
1252     _last_flag             = Flag_intel_jcc_erratum
1253   };
1254 };
1255 
1256 %} // end source_hpp
1257 
1258 source %{
1259 
1260 #include "opto/addnode.hpp"
1261 #include "c2_intelJccErratum_x86.hpp"
1262 
1263 void PhaseOutput::pd_perform_mach_node_analysis() {
1264   if (VM_Version::has_intel_jcc_erratum()) {
1265     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
1266     _buf_sizes._code += extra_padding;
1267   }
1268 }
1269 
1270 int MachNode::pd_alignment_required() const {
1271   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
1272     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
1273     return IntelJccErratum::largest_jcc_size() + 1;
1274   } else {
1275     return 1;
1276   }
1277 }
1278 
1279 int MachNode::compute_padding(int current_offset) const {
1280   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
1281     Compile* C = Compile::current();
1282     PhaseOutput* output = C->output();
1283     Block* block = output->block();
1284     int index = output->index();
1285     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
1286   } else {
1287     return 0;
1288   }
1289 }
1290 
1291 // Emit exception handler code.
1292 // Stuff framesize into a register and call a VM stub routine.
1293 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1294 
1295   // Note that the code buffer's insts_mark is always relative to insts.
1296   // That's why we must use the macroassembler to generate a handler.
1297   C2_MacroAssembler _masm(&cbuf);
1298   address base = __ start_a_stub(size_exception_handler());
1299   if (base == NULL) {
1300     ciEnv::current()->record_failure("CodeCache is full");
1301     return 0;  // CodeBuffer::expand failed
1302   }
1303   int offset = __ offset();
1304   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1305   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1306   __ end_a_stub();
1307   return offset;
1308 }
1309 
1310 // Emit deopt handler code.
1311 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1312 
1313   // Note that the code buffer's insts_mark is always relative to insts.
1314   // That's why we must use the macroassembler to generate a handler.
1315   C2_MacroAssembler _masm(&cbuf);
1316   address base = __ start_a_stub(size_deopt_handler());
1317   if (base == NULL) {
1318     ciEnv::current()->record_failure("CodeCache is full");
1319     return 0;  // CodeBuffer::expand failed
1320   }
1321   int offset = __ offset();
1322 
1323 #ifdef _LP64
1324   address the_pc = (address) __ pc();
1325   Label next;
1326   // Push "the_pc" on the stack without destroying any registers,
1327   // as they may all be live.
1328 
1329   // push address of "next"
1330   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1331   __ bind(next);
1332   // adjust it so it matches "the_pc"
1333   __ subptr(Address(rsp, 0), __ offset() - offset);
1334 #else
1335   InternalAddress here(__ pc());
1336   __ pushptr(here.addr());
1337 #endif
1338 
1339   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1340   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1341   __ end_a_stub();
1342   return offset;
1343 }
1344 
1345 Assembler::Width widthForType(BasicType bt) {
1346   if (bt == T_BYTE) {
1347     return Assembler::B;
1348   } else if (bt == T_SHORT) {
1349     return Assembler::W;
1350   } else if (bt == T_INT) {
1351     return Assembler::D;
1352   } else {
1353     assert(bt == T_LONG, "not a long: %s", type2name(bt));
1354     return Assembler::Q;
1355   }
1356 }
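     // Example: widthForType(T_SHORT) is Assembler::W and widthForType(T_LONG) is
     // Assembler::Q; any type other than T_BYTE/T_SHORT/T_INT/T_LONG trips the assert.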
1357 
1358 //=============================================================================
1359 
1360   // Float masks come from different places depending on platform.
1361 #ifdef _LP64
1362   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1363   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1364   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1365   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1366 #else
1367   static address float_signmask()  { return (address)float_signmask_pool; }
1368   static address float_signflip()  { return (address)float_signflip_pool; }
1369   static address double_signmask() { return (address)double_signmask_pool; }
1370   static address double_signflip() { return (address)double_signflip_pool; }
1371 #endif
1372   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1373   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1374   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1375   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1376   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1377   static address vector_int_mask_cmp_bits() { return StubRoutines::x86::vector_int_mask_cmp_bits(); }
1378   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1379   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
1380   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1381   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1382   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1383   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1384   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1385   static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip();}
1386   static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip();}
1387 
1388 //=============================================================================
1389 const bool Matcher::match_rule_supported(int opcode) {
1390   if (!has_match_rule(opcode)) {
1391     return false; // no match rule present
1392   }
1393   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1394   switch (opcode) {
1395     case Op_AbsVL:
1396     case Op_StoreVectorScatter:
1397       if (UseAVX < 3) {
1398         return false;
1399       }
1400       break;
1401     case Op_PopCountI:
1402     case Op_PopCountL:
1403       if (!UsePopCountInstruction) {
1404         return false;
1405       }
1406       break;
1407     case Op_PopCountVI:
1408       if (!UsePopCountInstruction || (UseAVX < 2)) {
1409         return false;
1410       }
1411       break;
1412     case Op_PopCountVL:
1413       if (!UsePopCountInstruction || (UseAVX <= 2)) {
1414         return false;
1415       }
1416       break;
1417     case Op_MulVI:
1418       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
1419         return false;
1420       }
1421       break;
1422     case Op_MulVL:
1423       if (UseSSE < 4) { // only with SSE4_1 or AVX
1424         return false;
1425       }
1426       break;
1427     case Op_MulReductionVL:
1428       if (VM_Version::supports_avx512dq() == false) {
1429         return false;
1430       }
1431       break;
1432     case Op_AddReductionVL:
1433       if (UseSSE < 2) { // requires at least SSE2
1434         return false;
1435       }
1436       break;
1437     case Op_AbsVB:
1438     case Op_AbsVS:
1439     case Op_AbsVI:
1440     case Op_AddReductionVI:
1441     case Op_AndReductionV:
1442     case Op_OrReductionV:
1443     case Op_XorReductionV:
1444       if (UseSSE < 3) { // requires at least SSSE3
1445         return false;
1446       }
1447       break;
1448     case Op_VectorLoadShuffle:
1449     case Op_VectorRearrange:
1450     case Op_MulReductionVI:
1451       if (UseSSE < 4) { // requires at least SSE4
1452         return false;
1453       }
1454       break;
1455     case Op_SqrtVD:
1456     case Op_SqrtVF:
1457     case Op_VectorMaskCmp:
1458     case Op_VectorCastB2X:
1459     case Op_VectorCastS2X:
1460     case Op_VectorCastI2X:
1461     case Op_VectorCastL2X:
1462     case Op_VectorCastF2X:
1463     case Op_VectorCastD2X:
1464     case Op_VectorUCastB2X:
1465     case Op_VectorUCastS2X:
1466     case Op_VectorUCastI2X:
1467       if (UseAVX < 1) { // enabled for AVX only
1468         return false;
1469       }
1470       break;
1471     case Op_CompareAndSwapL:
1472 #ifdef _LP64
1473     case Op_CompareAndSwapP:
1474 #endif
1475       if (!VM_Version::supports_cx8()) {
1476         return false;
1477       }
1478       break;
1479     case Op_CMoveVF:
1480     case Op_CMoveVD:
1481       if (UseAVX < 1) { // enabled for AVX only
1482         return false;
1483       }
1484       break;
1485     case Op_StrIndexOf:
1486       if (!UseSSE42Intrinsics) {
1487         return false;
1488       }
1489       break;
1490     case Op_StrIndexOfChar:
1491       if (!UseSSE42Intrinsics) {
1492         return false;
1493       }
1494       break;
1495     case Op_OnSpinWait:
1496       if (VM_Version::supports_on_spin_wait() == false) {
1497         return false;
1498       }
1499       break;
1500     case Op_MulVB:
1501     case Op_LShiftVB:
1502     case Op_RShiftVB:
1503     case Op_URShiftVB:
1504     case Op_VectorInsert:
1505     case Op_VectorLoadMask:
1506     case Op_VectorStoreMask:
1507     case Op_VectorBlend:
1508       if (UseSSE < 4) {
1509         return false;
1510       }
1511       break;
1512 #ifdef _LP64
1513     case Op_MaxD:
1514     case Op_MaxF:
1515     case Op_MinD:
1516     case Op_MinF:
1517       if (UseAVX < 1) { // enabled for AVX only
1518         return false;
1519       }
1520       break;
1521 #endif
1522     case Op_CacheWB:
1523     case Op_CacheWBPreSync:
1524     case Op_CacheWBPostSync:
1525       if (!VM_Version::supports_data_cache_line_flush()) {
1526         return false;
1527       }
1528       break;
1529     case Op_ExtractB:
1530     case Op_ExtractL:
1531     case Op_ExtractI:
1532     case Op_RoundDoubleMode:
1533       if (UseSSE < 4) {
1534         return false;
1535       }
1536       break;
1537     case Op_RoundDoubleModeV:
1538       if (VM_Version::supports_avx() == false) {
1539         return false; // 128bit vroundpd is not available
1540       }
1541       break;
1542     case Op_LoadVectorGather:
1543       if (UseAVX < 2) {
1544         return false;
1545       }
1546       break;
1547     case Op_FmaVD:
1548     case Op_FmaVF:
1549       if (!UseFMA) {
1550         return false;
1551       }
1552       break;
1553     case Op_MacroLogicV:
1554       if (UseAVX < 3 || !UseVectorMacroLogic) {
1555         return false;
1556       }
1557       break;
1558 
1559     case Op_VectorCmpMasked:
1560     case Op_VectorMaskGen:
1561     case Op_LoadVectorMasked:
1562     case Op_StoreVectorMasked:
1563       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
1564         return false;
1565       }
1566       break;
1567     case Op_VectorMaskFirstTrue:
1568     case Op_VectorMaskLastTrue:
1569     case Op_VectorMaskTrueCount:
1570     case Op_VectorMaskToLong:
1571       if (!is_LP64 || UseAVX < 1) {
1572          return false;
1573       }
1574       break;
1575     case Op_CopySignD:
1576     case Op_CopySignF:
1577       if (UseAVX < 3 || !is_LP64)  {
1578         return false;
1579       }
1580       if (!VM_Version::supports_avx512vl()) {
1581         return false;
1582       }
1583       break;
1584 #ifndef _LP64
1585     case Op_AddReductionVF:
1586     case Op_AddReductionVD:
1587     case Op_MulReductionVF:
1588     case Op_MulReductionVD:
1589       if (UseSSE < 1) { // requires at least SSE
1590         return false;
1591       }
1592       break;
1593     case Op_MulAddVS2VI:
1594     case Op_RShiftVL:
1595     case Op_AbsVD:
1596     case Op_NegVD:
1597       if (UseSSE < 2) {
1598         return false;
1599       }
1600       break;
1601 #endif // !LP64
1602     case Op_SignumF:
1603       if (UseSSE < 1) {
1604         return false;
1605       }
1606       break;
1607     case Op_SignumD:
1608       if (UseSSE < 2) {
1609         return false;
1610       }
1611       break;
1612     case Op_SqrtF:
1613       if (UseSSE < 1) {
1614         return false;
1615       }
1616       break;
1617     case Op_SqrtD:
1618 #ifdef _LP64
1619       if (UseSSE < 2) {
1620         return false;
1621       }
1622 #else
1623       // x86_32.ad has a special match rule for SqrtD.
1624       // Together with common x86 rules, this handles all UseSSE cases.
1625 #endif
1626       break;
1627   }
1628   return true;  // Match rules are supported by default.
1629 }
1630 
1631 //------------------------------------------------------------------------
1632 
1633 // Identify extra cases that we might want to provide match rules for vector nodes and
1634 // other intrinsics guarded with vector length (vlen) and element type (bt).
1635 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1636   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1637   if (!match_rule_supported(opcode)) {
1638     return false;
1639   }
1640   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1641   //   * SSE2 supports 128bit vectors for all types;
1642   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1643   //   * AVX2 supports 256bit vectors for all types;
1644   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1645   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1646   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1647   // And MaxVectorSize is taken into account as well.
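       // For example, a 256bit vector of BYTEs (vlen == 32) passes this check only
       // with AVX2 or higher, since AVX1 widens only FLOAT and DOUBLE to 256bit.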
1648   if (!vector_size_supported(bt, vlen)) {
1649     return false;
1650   }
1651   // Special cases which require vector length follow:
1652   //   * implementation limitations
1653   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1654   //   * 128bit vroundpd instruction is present only in AVX1
1655   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1656   switch (opcode) {
1657     case Op_AbsVF:
1658     case Op_NegVF:
1659       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1660         return false; // 512bit vandps and vxorps are not available
1661       }
1662       break;
1663     case Op_AbsVD:
1664     case Op_NegVD:
1665     case Op_MulVL:
1666       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1667         return false; // 512bit vpmullq, vandpd and vxorpd are not available
1668       }
1669       break;
1670     case Op_CMoveVF:
1671       if (vlen != 8) {
1672         return false; // implementation limitation (only vcmov8F_reg is present)
1673       }
1674       break;
1675     case Op_RotateRightV:
1676     case Op_RotateLeftV:
1677       if (bt != T_INT && bt != T_LONG) {
1678         return false;
1679       } // fallthrough
1680     case Op_MacroLogicV:
1681       if (!VM_Version::supports_evex() ||
1682           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
1683         return false;
1684       }
1685       break;
1686     case Op_ClearArray:
1687     case Op_VectorMaskGen:
1688     case Op_VectorCmpMasked:
1689     case Op_LoadVectorMasked:
1690     case Op_StoreVectorMasked:
1691       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
1692         return false;
1693       }
1694       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
1695         return false;
1696       }
1697       break;
1698     case Op_CMoveVD:
1699       if (vlen != 4) {
1700         return false; // implementation limitation (only vcmov4D_reg is present)
1701       }
1702       break;
1703     case Op_MaxV:
1704     case Op_MinV:
1705       if (UseSSE < 4 && is_integral_type(bt)) {
1706         return false;
1707       }
1708       if ((bt == T_FLOAT || bt == T_DOUBLE)) {
1709           // Float/Double intrinsics are enabled for AVX family currently.
1710           if (UseAVX == 0) {
1711             return false;
1712           }
1713           if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
1714             return false;
1715           }
1716       }
1717       break;
1718     case Op_CallLeafVector:
1719       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
1720         return false;
1721       }
1722       break;
1723     case Op_AddReductionVI:
1724       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
1725         return false;
1726       }
1727       // fallthrough
1728     case Op_AndReductionV:
1729     case Op_OrReductionV:
1730     case Op_XorReductionV:
1731       if (is_subword_type(bt) && (UseSSE < 4)) {
1732         return false;
1733       }
1734 #ifndef _LP64
1735       if (bt == T_BYTE || bt == T_LONG) {
1736         return false;
1737       }
1738 #endif
1739       break;
1740 #ifndef _LP64
1741     case Op_VectorInsert:
1742       if (bt == T_LONG || bt == T_DOUBLE) {
1743         return false;
1744       }
1745       break;
1746 #endif
1747     case Op_MinReductionV:
1748     case Op_MaxReductionV:
1749       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
1750         return false;
1751       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
1752         return false;
1753       }
1754       // Float/Double intrinsics enabled for AVX family.
1755       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
1756         return false;
1757       }
1758       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
1759         return false;
1760       }
1761 #ifndef _LP64
1762       if (bt == T_BYTE || bt == T_LONG) {
1763         return false;
1764       }
1765 #endif
1766       break;
1767     case Op_VectorTest:
1768       if (UseSSE < 4) {
1769         return false; // Implementation limitation
1770       } else if (size_in_bits < 32) {
1771         return false; // Implementation limitation
1772       } else if (size_in_bits == 512 && (VM_Version::supports_avx512bw() == false)) {
1773         return false; // Implementation limitation
1774       }
1775       break;
1776     case Op_VectorLoadShuffle:
1777     case Op_VectorRearrange:
1778       if (vlen == 2) {
1779         return false; // Implementation limitation due to how shuffle is loaded
1780       } else if (size_in_bits == 256 && UseAVX < 2) {
1781         return false; // Implementation limitation
1782       } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi())  {
1783         return false; // Implementation limitation
1784       } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw())  {
1785         return false; // Implementation limitation
1786       }
1787       break;
1788     case Op_VectorLoadMask:
1789       if (size_in_bits == 256 && UseAVX < 2) {
1790         return false; // Implementation limitation
1791       }
1792       // fallthrough
1793     case Op_VectorStoreMask:
1794       if (vlen == 2) {
1795         return false; // Implementation limitation
1796       }
1797       break;
1798     case Op_VectorCastB2X:
1799     case Op_VectorCastS2X:
1800     case Op_VectorCastI2X:
1801       if (bt != T_DOUBLE && size_in_bits == 256 && UseAVX < 2) {
1802         return false;
1803       }
1804       break;
1805     case Op_VectorCastL2X:
1806       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1807         return false;
1808       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
1809         return false;
1810       }
1811       break;
1812     case Op_VectorCastD2X:
1813       if (is_subword_type(bt) || bt == T_INT) {
1814         return false;
1815       }
1816       if (bt == T_LONG && !VM_Version::supports_avx512dq()) {
1817         return false;
1818       }
1819       break;
1820     case Op_VectorCastF2X:
1821       if (is_subword_type(bt) || bt == T_LONG) {
1822         return false;
1823       }
1824       break;
1825     case Op_MulReductionVI:
1826       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1827         return false;
1828       }
1829       break;
1830     case Op_LoadVectorGatherMasked:
1831     case Op_StoreVectorScatterMasked:
1832     case Op_StoreVectorScatter:
1833       if (is_subword_type(bt)) {
1834         return false;
1835       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1836         return false;
1837       }
1838       // fallthrough
1839     case Op_LoadVectorGather:
1840       if (size_in_bits == 64 ) {
1841         return false;
1842       }
1843       break;
1844     case Op_MaskAll:
1845       if (!VM_Version::supports_evex()) {
1846         return false;
1847       }
1848       if ((vlen > 16 || is_subword_type(bt)) && !VM_Version::supports_avx512bw()) {
1849         return false;
1850       }
1851       if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1852         return false;
1853       }
1854       break;
1855     case Op_VectorMaskCmp:
1856       if (vlen < 2 || size_in_bits < 32) {
1857         return false;
1858       }
1859       break;
1860     case Op_VectorLongToMask:
1861       if (UseAVX < 1 || !is_LP64) {
1862         return false;
1863       }
1864       if (UseAVX < 3 && !VM_Version::supports_bmi2()) {
1865         return false;
1866       }
1867       break;
1868     case Op_PopCountVI:
1869       if (!VM_Version::supports_avx512_vpopcntdq() &&
1870           (vlen == 16) && !VM_Version::supports_avx512bw()) {
1871         return false;
1872       }
1873       break;
1874     case Op_PopCountVL:
1875       if (!VM_Version::supports_avx512_vpopcntdq() &&
1876           ((vlen <= 4) || ((vlen == 8) && !VM_Version::supports_avx512bw()))) {
1877         return false;
1878       }
1879       break;
1880   }
1881   return true;  // By default, match rules are supported.
1882 }
1883 
1884 const bool Matcher::match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt) {
1885   // ADLC based match_rule_supported routine checks for the existence of pattern based
1886   // on the IR opcode. Most of the unary/binary/ternary masked operations share the IR nodes
1887   // of their non-masked counterparts, with the mask edge being the differentiator.
1888   // This routine does a strict check on the existence of masked operation patterns
1889   // by returning a default false value for all the other opcodes apart from the
1890   // ones whose masked instruction patterns are defined in this file.
1891   if (!match_rule_supported_vector(opcode, vlen, bt)) {
1892     return false;
1893   }
1894 
1895   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1896   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1897   if (size_in_bits != 512 && !VM_Version::supports_avx512vl()) {
1898     return false;
1899   }
1900   switch(opcode) {
1901     // Unary masked operations
1902     case Op_AbsVB:
1903     case Op_AbsVS:
1904       if (!VM_Version::supports_avx512bw()) {
1905         return false;  // Implementation limitation
1906       }
1907     case Op_AbsVI:
1908     case Op_AbsVL:
1909       return true;
1910 
1911     // Ternary masked operations
1912     case Op_FmaVF:
1913     case Op_FmaVD:
1914       return true;
1915 
1916     case Op_MacroLogicV:
1917       if (bt != T_INT && bt != T_LONG) {
1918         return false;
1919       }
1920       return true;
1921 
1922     // Binary masked operations
1923     case Op_AddVB:
1924     case Op_AddVS:
1925     case Op_SubVB:
1926     case Op_SubVS:
1927     case Op_MulVS:
1928     case Op_LShiftVS:
1929     case Op_RShiftVS:
1930     case Op_URShiftVS:
1931       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1932       if (!VM_Version::supports_avx512bw()) {
1933         return false;  // Implementation limitation
1934       }
1935       return true;
1936 
1937     case Op_MulVL:
1938       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1939       if (!VM_Version::supports_avx512dq()) {
1940         return false;  // Implementation limitation
1941       }
1942       return true;
1943 
1944     case Op_AndV:
1945     case Op_OrV:
1946     case Op_XorV:
1947     case Op_RotateRightV:
1948     case Op_RotateLeftV:
1949       if (bt != T_INT && bt != T_LONG) {
1950         return false; // Implementation limitation
1951       }
1952       return true;
1953 
1954     case Op_VectorLoadMask:
1955       assert(size_in_bits == 512 || VM_Version::supports_avx512vl(), "");
1956       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
1957         return false;
1958       }
1959       return true;
1960 
1961     case Op_AddVI:
1962     case Op_AddVL:
1963     case Op_AddVF:
1964     case Op_AddVD:
1965     case Op_SubVI:
1966     case Op_SubVL:
1967     case Op_SubVF:
1968     case Op_SubVD:
1969     case Op_MulVI:
1970     case Op_MulVF:
1971     case Op_MulVD:
1972     case Op_DivVF:
1973     case Op_DivVD:
1974     case Op_SqrtVF:
1975     case Op_SqrtVD:
1976     case Op_LShiftVI:
1977     case Op_LShiftVL:
1978     case Op_RShiftVI:
1979     case Op_RShiftVL:
1980     case Op_URShiftVI:
1981     case Op_URShiftVL:
1982     case Op_LoadVectorMasked:
1983     case Op_StoreVectorMasked:
1984     case Op_LoadVectorGatherMasked:
1985     case Op_StoreVectorScatterMasked:
1986       return true;
1987 
1988     case Op_MaxV:
1989     case Op_MinV:
1990       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
1991         return false; // Implementation limitation
1992       }
1993       if (is_floating_point_type(bt)) {
1994         return false; // Implementation limitation
1995       }
1996       return true;
1997 
1998     case Op_VectorMaskCmp:
1999       if (is_subword_type(bt) && !VM_Version::supports_avx512bw()) {
2000         return false; // Implementation limitation
2001       }
2002       return true;
2003 
2004     case Op_VectorRearrange:
2005       if (bt == T_SHORT && !VM_Version::supports_avx512bw()) {
2006         return false; // Implementation limitation
2007       }
2008       if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
2009         return false; // Implementation limitation
2010       } else if ((bt == T_INT || bt == T_FLOAT) && size_in_bits < 256) {
2011         return false; // Implementation limitation
2012       }
2013       return true;
2014 
2015     // Binary Logical operations
2016     case Op_AndVMask:
2017     case Op_OrVMask:
2018     case Op_XorVMask:
2019       if (vlen > 16 && !VM_Version::supports_avx512bw()) {
2020         return false; // Implementation limitation
2021       }
2022       return true;
2023 
2024     case Op_MaskAll:
2025       return true;
2026 
2027     default:
2028       return false;
2029   }
2030 }
2031 
2032 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
2033   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
2034   bool legacy = (generic_opnd->opcode() == LEGVEC);
2035   if (!VM_Version::supports_avx512vlbwdq() && // KNL
2036       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
2037     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
2038     return new legVecZOper();
2039   }
2040   if (legacy) {
2041     switch (ideal_reg) {
2042       case Op_VecS: return new legVecSOper();
2043       case Op_VecD: return new legVecDOper();
2044       case Op_VecX: return new legVecXOper();
2045       case Op_VecY: return new legVecYOper();
2046       case Op_VecZ: return new legVecZOper();
2047     }
2048   } else {
2049     switch (ideal_reg) {
2050       case Op_VecS: return new vecSOper();
2051       case Op_VecD: return new vecDOper();
2052       case Op_VecX: return new vecXOper();
2053       case Op_VecY: return new vecYOper();
2054       case Op_VecZ: return new vecZOper();
2055     }
2056   }
2057   ShouldNotReachHere();
2058   return NULL;
2059 }
2060 
2061 bool Matcher::is_reg2reg_move(MachNode* m) {
2062   switch (m->rule()) {
2063     case MoveVec2Leg_rule:
2064     case MoveLeg2Vec_rule:
2065     case MoveF2VL_rule:
2066     case MoveF2LEG_rule:
2067     case MoveVL2F_rule:
2068     case MoveLEG2F_rule:
2069     case MoveD2VL_rule:
2070     case MoveD2LEG_rule:
2071     case MoveVL2D_rule:
2072     case MoveLEG2D_rule:
2073       return true;
2074     default:
2075       return false;
2076   }
2077 }
2078 
2079 bool Matcher::is_generic_vector(MachOper* opnd) {
2080   switch (opnd->opcode()) {
2081     case VEC:
2082     case LEGVEC:
2083       return true;
2084     default:
2085       return false;
2086   }
2087 }
2088 
2089 //------------------------------------------------------------------------
2090 
2091 const RegMask* Matcher::predicate_reg_mask(void) {
2092   return &_VECTMASK_REG_mask;
2093 }
2094 
2095 const TypeVectMask* Matcher::predicate_reg_type(const Type* elemTy, int length) {
2096   return new TypeVectMask(elemTy, length);
2097 }
2098 
2099 // Max vector size in bytes. 0 if not supported.
2100 const int Matcher::vector_width_in_bytes(BasicType bt) {
2101   assert(is_java_primitive(bt), "only primitive type vectors");
2102   if (UseSSE < 2) return 0;
2103   // SSE2 supports 128bit vectors for all types.
2104   // AVX2 supports 256bit vectors for all types.
2105   // AVX-512 (EVEX) supports 512bit vectors for all types.
2106   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
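       // e.g. UseAVX == 2 gives (1 << 2) * 8 = 32 bytes (256bit) here, and
       // UseAVX == 3 gives 64 bytes (512bit), before the refinements below.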
2107   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
2108   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
2109     size = (UseAVX > 2) ? 64 : 32;
2110   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
2111     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
2112   // Use flag to limit vector size.
2113   size = MIN2(size,(int)MaxVectorSize);
2114   // Minimum 2 values in vector (or 4 for bytes).
2115   switch (bt) {
2116   case T_DOUBLE:
2117   case T_LONG:
2118     if (size < 16) return 0;
2119     break;
2120   case T_FLOAT:
2121   case T_INT:
2122     if (size < 8) return 0;
2123     break;
2124   case T_BOOLEAN:
2125     if (size < 4) return 0;
2126     break;
2127   case T_CHAR:
2128     if (size < 4) return 0;
2129     break;
2130   case T_BYTE:
2131     if (size < 4) return 0;
2132     break;
2133   case T_SHORT:
2134     if (size < 4) return 0;
2135     break;
2136   default:
2137     ShouldNotReachHere();
2138   }
2139   return size;
2140 }
2141 
2142 // Limits on vector size (number of elements) loaded into vector.
2143 const int Matcher::max_vector_size(const BasicType bt) {
2144   return vector_width_in_bytes(bt)/type2aelembytes(bt);
2145 }
2146 const int Matcher::min_vector_size(const BasicType bt) {
2147   int max_size = max_vector_size(bt);
2148   // Min size which can be loaded into vector is 4 bytes.
2149   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
2150   // Support for calling svml double64 vectors
2151   if (bt == T_DOUBLE) {
2152     size = 1;
2153   }
2154   return MIN2(size,max_size);
2155 }
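     // For example, min_vector_size(T_BYTE) is 4 elements and min_vector_size(T_INT)
     // is 2, while T_DOUBLE is allowed down to a single element for the SVML calls
     // mentioned above; all results are capped by max_vector_size().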
2156 
2157 const int Matcher::scalable_vector_reg_size(const BasicType bt) {
2158   return -1;
2159 }
2160 
2161 // Vector ideal reg corresponding to specified size in bytes
2162 const uint Matcher::vector_ideal_reg(int size) {
2163   assert(MaxVectorSize >= size, "");
2164   switch(size) {
2165     case  4: return Op_VecS;
2166     case  8: return Op_VecD;
2167     case 16: return Op_VecX;
2168     case 32: return Op_VecY;
2169     case 64: return Op_VecZ;
2170   }
2171   ShouldNotReachHere();
2172   return 0;
2173 }
2174 
2175 // Check for shift by small constant as well
2176 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
2177   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
2178       shift->in(2)->get_int() <= 3 &&
2179       // Are there other uses besides address expressions?
2180       !matcher->is_visited(shift)) {
2181     address_visited.set(shift->_idx); // Flag as address_visited
2182     mstack.push(shift->in(2), Matcher::Visit);
2183     Node *conv = shift->in(1);
2184 #ifdef _LP64
2185     // Allow the Matcher to match the rule which bypasses the
2186     // ConvI2L operation for an array index on LP64
2187     // if the index value is positive.
2188     if (conv->Opcode() == Op_ConvI2L &&
2189         conv->as_Type()->type()->is_long()->_lo >= 0 &&
2190         // Are there other uses besides address expressions?
2191         !matcher->is_visited(conv)) {
2192       address_visited.set(conv->_idx); // Flag as address_visited
2193       mstack.push(conv->in(1), Matcher::Pre_Visit);
2194     } else
2195 #endif
2196       mstack.push(conv, Matcher::Pre_Visit);
2197     return true;
2198   }
2199   return false;
2200 }
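     // For example, in an address expression such as base + (index << 2) the shift
     // is flagged here so it can fold into a scaled addressing mode instead of being
     // computed into a separate register.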
2201 
2202 // This function identifies sub-graphs in which a 'load' node is
2203 // input to two different nodes, such that the sub-graph can be matched
2204 // with BMI instructions like blsi, blsr, etc.
2205 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
2206 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
2207 // refers to the same node.
2208 //
2209 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
2210 // This is a temporary solution until we make DAGs expressible in ADL.
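     // For example, is_bmi_pattern() below uses
     //   FusedPatternMatcher<TypeInt>(n, m, Op_ConI).match(Op_AndI, -1, Op_SubI, 1, 0)
     // to recognize (AndI (SubI ConI(0) LoadI) LoadI), the 32-bit blsi form of the
     // graph shown above.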
2211 template<typename ConType>
2212 class FusedPatternMatcher {
2213   Node* _op1_node;
2214   Node* _mop_node;
2215   int _con_op;
2216 
2217   static int match_next(Node* n, int next_op, int next_op_idx) {
2218     if (n->in(1) == NULL || n->in(2) == NULL) {
2219       return -1;
2220     }
2221 
2222     if (next_op_idx == -1) { // n is commutative, try rotations
2223       if (n->in(1)->Opcode() == next_op) {
2224         return 1;
2225       } else if (n->in(2)->Opcode() == next_op) {
2226         return 2;
2227       }
2228     } else {
2229       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
2230       if (n->in(next_op_idx)->Opcode() == next_op) {
2231         return next_op_idx;
2232       }
2233     }
2234     return -1;
2235   }
2236 
2237  public:
2238   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
2239     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
2240 
2241   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
2242              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
2243              typename ConType::NativeType con_value) {
2244     if (_op1_node->Opcode() != op1) {
2245       return false;
2246     }
2247     if (_mop_node->outcnt() > 2) {
2248       return false;
2249     }
2250     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
2251     if (op1_op2_idx == -1) {
2252       return false;
2253     }
2254     // Memory operation must be the other edge
2255     int op1_mop_idx = (op1_op2_idx & 1) + 1;
2256 
2257     // Check that the mop node is really what we want
2258     if (_op1_node->in(op1_mop_idx) == _mop_node) {
2259       Node* op2_node = _op1_node->in(op1_op2_idx);
2260       if (op2_node->outcnt() > 1) {
2261         return false;
2262       }
2263       assert(op2_node->Opcode() == op2, "Should be");
2264       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
2265       if (op2_con_idx == -1) {
2266         return false;
2267       }
2268       // Memory operation must be the other edge
2269       int op2_mop_idx = (op2_con_idx & 1) + 1;
2270       // Check that the memory operation is the same node
2271       if (op2_node->in(op2_mop_idx) == _mop_node) {
2272         // Now check the constant
2273         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
2274         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
2275           return true;
2276         }
2277       }
2278     }
2279     return false;
2280   }
2281 };
2282 
2283 static bool is_bmi_pattern(Node* n, Node* m) {
2284   assert(UseBMI1Instructions, "sanity");
2285   if (n != NULL && m != NULL) {
2286     if (m->Opcode() == Op_LoadI) {
2287       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
2288       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
2289              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
2290              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
2291     } else if (m->Opcode() == Op_LoadL) {
2292       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
2293       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
2294              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
2295              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
2296     }
2297   }
2298   return false;
2299 }
2300 
2301 // Should the matcher clone input 'm' of node 'n'?
2302 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
2303   // If 'n' and 'm' are part of a graph for a BMI instruction, clone the input 'm'.
2304   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
2305     mstack.push(m, Visit);
2306     return true;
2307   }
2308   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
2309     mstack.push(m, Visit);           // m = ShiftCntV
2310     return true;
2311   }
2312   return false;
2313 }
2314 
2315 // Should the Matcher clone shifts on addressing modes, expecting them
2316 // to be subsumed into complex addressing expressions or compute them
2317 // into registers?
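     // For example, an address of the form (AddP base (AddP base (LShiftL idx scale)) con)
     // has the shift and the constant offset cloned so that matching can fold the whole
     // expression into a single addressing mode such as [base + idx << scale + con].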
2318 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
2319   Node *off = m->in(AddPNode::Offset);
2320   if (off->is_Con()) {
2321     address_visited.test_set(m->_idx); // Flag as address_visited
2322     Node *adr = m->in(AddPNode::Address);
2323 
2324     // Intel can handle 2 adds in an addressing mode.
2325     // AtomicAdd is not an addressing expression.
2326     // Cheap to find it by looking for screwy base.
2327     if (adr->is_AddP() &&
2328         !adr->in(AddPNode::Base)->is_top() &&
2329         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
2330         // Are there other uses besides address expressions?
2331         !is_visited(adr)) {
2332       address_visited.set(adr->_idx); // Flag as address_visited
2333       Node *shift = adr->in(AddPNode::Offset);
2334       if (!clone_shift(shift, this, mstack, address_visited)) {
2335         mstack.push(shift, Pre_Visit);
2336       }
2337       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
2338       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
2339     } else {
2340       mstack.push(adr, Pre_Visit);
2341     }
2342 
2343     // Clone X+offset as it also folds into most addressing expressions
2344     mstack.push(off, Visit);
2345     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2346     return true;
2347   } else if (clone_shift(off, this, mstack, address_visited)) {
2348     address_visited.test_set(m->_idx); // Flag as address_visited
2349     mstack.push(m->in(AddPNode::Address), Pre_Visit);
2350     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2351     return true;
2352   }
2353   return false;
2354 }
2355 
2356 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
2357   switch (bt) {
2358     case BoolTest::eq:
2359       return Assembler::eq;
2360     case BoolTest::ne:
2361       return Assembler::neq;
2362     case BoolTest::le:
2363     case BoolTest::ule:
2364       return Assembler::le;
2365     case BoolTest::ge:
2366     case BoolTest::uge:
2367       return Assembler::nlt;
2368     case BoolTest::lt:
2369     case BoolTest::ult:
2370       return Assembler::lt;
2371     case BoolTest::gt:
2372     case BoolTest::ugt:
2373       return Assembler::nle;
2374     default : ShouldNotReachHere(); return Assembler::_false;
2375   }
2376 }
2377 
2378 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
2379   switch (bt) {
2380   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
2381   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
2382   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
2383   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
2384   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
2385   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
2386   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
2387   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
2388   }
2389 }
2390 
2391 // Helper methods for MachSpillCopyNode::implementation().
2392 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
2393                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
2394   assert(ireg == Op_VecS || // 32bit vector
2395          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
2396          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
2397          "no non-adjacent vector moves" );
2398   if (cbuf) {
2399     C2_MacroAssembler _masm(cbuf);
2400     switch (ireg) {
2401     case Op_VecS: // copy whole register
2402     case Op_VecD:
2403     case Op_VecX:
2404 #ifndef _LP64
2405       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2406 #else
2407       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2408         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2409       } else {
2410         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2411       }
2412 #endif
2413       break;
2414     case Op_VecY:
2415 #ifndef _LP64
2416       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2417 #else
2418       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2419         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2420       } else {
2421         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2422       }
2423 #endif
2424       break;
2425     case Op_VecZ:
2426       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
2427       break;
2428     default:
2429       ShouldNotReachHere();
2430     }
2431 #ifndef PRODUCT
2432   } else {
2433     switch (ireg) {
2434     case Op_VecS:
2435     case Op_VecD:
2436     case Op_VecX:
2437       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2438       break;
2439     case Op_VecY:
2440     case Op_VecZ:
2441       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2442       break;
2443     default:
2444       ShouldNotReachHere();
2445     }
2446 #endif
2447   }
2448 }
2449 
2450 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
2451                      int stack_offset, int reg, uint ireg, outputStream* st) {
2452   if (cbuf) {
2453     C2_MacroAssembler _masm(cbuf);
2454     if (is_load) {
2455       switch (ireg) {
2456       case Op_VecS:
2457         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2458         break;
2459       case Op_VecD:
2460         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2461         break;
2462       case Op_VecX:
2463 #ifndef _LP64
2464         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2465 #else
2466         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2467           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2468         } else {
2469           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2470           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2471         }
2472 #endif
2473         break;
2474       case Op_VecY:
2475 #ifndef _LP64
2476         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2477 #else
2478         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2479           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2480         } else {
2481           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2482           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2483         }
2484 #endif
2485         break;
2486       case Op_VecZ:
2487         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
2488         break;
2489       default:
2490         ShouldNotReachHere();
2491       }
2492     } else { // store
2493       switch (ireg) {
2494       case Op_VecS:
2495         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2496         break;
2497       case Op_VecD:
2498         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2499         break;
2500       case Op_VecX:
2501 #ifndef _LP64
2502         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2503 #else
2504         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2505           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2506         } else {
2508           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2509         }
2510 #endif
2511         break;
2512       case Op_VecY:
2513 #ifndef _LP64
2514         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2515 #else
2516         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2517           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2518         } else {
2520           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2521         }
2522 #endif
2523         break;
2524       case Op_VecZ:
2525         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2526         break;
2527       default:
2528         ShouldNotReachHere();
2529       }
2530     }
2531 #ifndef PRODUCT
2532   } else {
2533     if (is_load) {
2534       switch (ireg) {
2535       case Op_VecS:
2536         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2537         break;
2538       case Op_VecD:
2539         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2540         break;
2541        case Op_VecX:
2542         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2543         break;
2544       case Op_VecY:
2545       case Op_VecZ:
2546         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2547         break;
2548       default:
2549         ShouldNotReachHere();
2550       }
2551     } else { // store
2552       switch (ireg) {
2553       case Op_VecS:
2554         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2555         break;
2556       case Op_VecD:
2557         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2558         break;
2559        case Op_VecX:
2560         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2561         break;
2562       case Op_VecY:
2563       case Op_VecZ:
2564         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2565         break;
2566       default:
2567         ShouldNotReachHere();
2568       }
2569     }
2570 #endif
2571   }
2572 }
2573 
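     // Build an array of 'len' copies of the immediate 'con', viewed as elements of
     // basic type 'bt' (e.g. for replicating a scalar constant across vector lanes).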
2574 template <class T>
2575 static inline GrowableArray<jvalue>* vreplicate_imm(BasicType bt, T con, int len) {
2576   GrowableArray<jvalue>* val = new GrowableArray<jvalue>(len);
2577   jvalue ele;
2578   switch (bt) {
2579     case T_BYTE:   ele.b = con; break;
2580     case T_SHORT:  ele.s = con; break;
2581     case T_INT:    ele.i = con; break;
2582     case T_LONG:   ele.j = con; break;
2583     case T_FLOAT:  ele.f = con; break;
2584     case T_DOUBLE: ele.d = con; break;
2585     default: ShouldNotReachHere();
2586   }
2587   for (int i = 0; i < len; i++) {
2588     val->append(ele);
2589   }
2590   return val;
2591 }
2592 
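     // Return a 64-bit pattern with the sign (highest) bit of every 'bt'-sized lane set,
     // e.g. 0x80 repeated in each byte for T_BYTE.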
2593 static inline jlong high_bit_set(BasicType bt) {
2594   switch (bt) {
2595     case T_BYTE:  return 0x8080808080808080;
2596     case T_SHORT: return 0x8000800080008000;
2597     case T_INT:   return 0x8000000080000000;
2598     case T_LONG:  return 0x8000000000000000;
2599     default:
2600       ShouldNotReachHere();
2601       return 0;
2602   }
2603 }
2604 
2605 #ifndef PRODUCT
2606   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2607     st->print("nop \t# %d bytes pad for loops and calls", _count);
2608   }
2609 #endif
2610 
2611   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
2612     C2_MacroAssembler _masm(&cbuf);
2613     __ nop(_count);
2614   }
2615 
2616   uint MachNopNode::size(PhaseRegAlloc*) const {
2617     return _count;
2618   }
2619 
2620 #ifndef PRODUCT
2621   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2622     st->print("# breakpoint");
2623   }
2624 #endif
2625 
2626   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2627     C2_MacroAssembler _masm(&cbuf);
2628     __ int3();
2629   }
2630 
2631   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2632     return MachNode::size(ra_);
2633   }
2634 
2635 %}
2636 
2637 encode %{
2638 
2639   enc_class call_epilog %{
2640     if (VerifyStackAtCalls) {
2641       // Check that stack depth is unchanged: find the magic cookie on the stack
2642       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2643       C2_MacroAssembler _masm(&cbuf);
2644       Label L;
2645       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2646       __ jccb(Assembler::equal, L);
2647       // Die if stack mismatch
2648       __ int3();
2649       __ bind(L);
2650     }
2651     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
2652       C2_MacroAssembler _masm(&cbuf);
2653       if (!_method->signature()->returns_null_free_inline_type()) {
2654         // The last return value is not set by the callee but is used to pass IsInit information to compiled code.
2655         // Search for the corresponding projection, get the register and emit code that initializes it.
2656         uint con = (tf()->range_cc()->cnt() - 1);
2657         for (DUIterator_Fast imax, i = fast_outs(imax); i < imax; i++) {
2658           ProjNode* proj = fast_out(i)->as_Proj();
2659           if (proj->_con == con) {
2660             // Set IsInit if rax is non-null (a non-null value is returned buffered or scalarized)
2661             OptoReg::Name optoReg = ra_->get_reg_first(proj);
2662             VMReg reg = OptoReg::as_VMReg(optoReg, ra_->_framesize, OptoReg::reg2stack(ra_->_matcher._new_SP));
2663             Register toReg = reg->is_reg() ? reg->as_Register() : rscratch1;
2664             __ testq(rax, rax);
2665             __ set_byte_if_not_zero(toReg);
2666             __ movzbl(toReg, toReg);
2667             if (reg->is_stack()) {
2668               int st_off = reg->reg2stack() * VMRegImpl::stack_slot_size;
2669               __ movq(Address(rsp, st_off), toReg);
2670             }
2671             break;
2672           }
2673         }
2674       }
2675       if (return_value_is_used()) {
2676         // An inline type is returned as fields in multiple registers.
2677         // Rax either contains an oop (if the inline type is buffered) or a pointer
2678         // to the corresponding InlineKlass with the lowest bit set to 1. Zero rax
2679         // if the lowest bit is set, so that C2 can use the oop after null checking.
2680         // rax &= (rax & 1) - 1
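     //   lowest bit set (InlineKlass pointer): (rax & 1) - 1 == 0,  so rax is cleared
     //   lowest bit clear (buffered oop):      (rax & 1) - 1 == -1, so rax is unchanged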
2681         __ movptr(rscratch1, rax);
2682         __ andptr(rscratch1, 0x1);
2683         __ subptr(rscratch1, 0x1);
2684         __ andptr(rax, rscratch1);
2685       }
2686     }
2687   %}
2688 
2689 %}
2690 
2691 // Operands for bound floating-point register arguments
2692 operand rxmm0() %{
2693   constraint(ALLOC_IN_RC(xmm0_reg));
2694   match(VecX);
2695   format %{ %}
2696   interface(REG_INTER);
2697 %}
2698 
2699 //----------OPERANDS-----------------------------------------------------------
2700 // Operand definitions must precede instruction definitions for correct parsing
2701 // in the ADLC because operands constitute user defined types which are used in
2702 // instruction definitions.
2703 
2704 // Vectors
2705 
2706 // Dummy generic vector class. Should be used for all vector operands.
2707 // Replaced with vec[SDXYZ] during post-selection pass.
2708 operand vec() %{
2709   constraint(ALLOC_IN_RC(dynamic));
2710   match(VecX);
2711   match(VecY);
2712   match(VecZ);
2713   match(VecS);
2714   match(VecD);
2715 
2716   format %{ %}
2717   interface(REG_INTER);
2718 %}
2719 
2720 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
2721 // Replaced with legVec[SDXYZ] during post-selection cleanup.
2722 // Note: the legacy register class is used to avoid extra runtime code generation
2723 // via reg_class_dynamic, which is unneeded in the 32-bit VM.
2724 operand legVec() %{
2725   constraint(ALLOC_IN_RC(dynamic));
2726   match(VecX);
2727   match(VecY);
2728   match(VecZ);
2729   match(VecS);
2730   match(VecD);
2731 
2732   format %{ %}
2733   interface(REG_INTER);
2734 %}
2735 
2736 // Replaces vec during post-selection cleanup. See above.
2737 operand vecS() %{
2738   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
2739   match(VecS);
2740 
2741   format %{ %}
2742   interface(REG_INTER);
2743 %}
2744 
2745 // Replaces legVec during post-selection cleanup. See above.
2746 operand legVecS() %{
2747   constraint(ALLOC_IN_RC(vectors_reg_legacy));
2748   match(VecS);
2749 
2750   format %{ %}
2751   interface(REG_INTER);
2752 %}
2753 
2754 // Replaces vec during post-selection cleanup. See above.
2755 operand vecD() %{
2756   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
2757   match(VecD);
2758 
2759   format %{ %}
2760   interface(REG_INTER);
2761 %}
2762 
2763 // Replaces legVec during post-selection cleanup. See above.
2764 operand legVecD() %{
2765   constraint(ALLOC_IN_RC(vectord_reg_legacy));
2766   match(VecD);
2767 
2768   format %{ %}
2769   interface(REG_INTER);
2770 %}
2771 
2772 // Replaces vec during post-selection cleanup. See above.
2773 operand vecX() %{
2774   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
2775   match(VecX);
2776 
2777   format %{ %}
2778   interface(REG_INTER);
2779 %}
2780 
2781 // Replaces legVec during post-selection cleanup. See above.
2782 operand legVecX() %{
2783   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
2784   match(VecX);
2785 
2786   format %{ %}
2787   interface(REG_INTER);
2788 %}
2789 
2790 // Replaces vec during post-selection cleanup. See above.
2791 operand vecY() %{
2792   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
2793   match(VecY);
2794 
2795   format %{ %}
2796   interface(REG_INTER);
2797 %}
2798 
2799 // Replaces legVec during post-selection cleanup. See above.
2800 operand legVecY() %{
2801   constraint(ALLOC_IN_RC(vectory_reg_legacy));
2802   match(VecY);
2803 
2804   format %{ %}
2805   interface(REG_INTER);
2806 %}
2807 
2808 // Replaces vec during post-selection cleanup. See above.
2809 operand vecZ() %{
2810   constraint(ALLOC_IN_RC(vectorz_reg));
2811   match(VecZ);
2812 
2813   format %{ %}
2814   interface(REG_INTER);
2815 %}
2816 
2817 // Replaces legVec during post-selection cleanup. See above.
2818 operand legVecZ() %{
2819   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
2820   match(VecZ);
2821 
2822   format %{ %}
2823   interface(REG_INTER);
2824 %}
2825 
2826 // Comparison Code for FP conditional move
2827 operand cmpOp_vcmppd() %{
2828   match(Bool);
2829 
2830   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2831             n->as_Bool()->_test._test != BoolTest::no_overflow);
2832   format %{ "" %}
2833   interface(COND_INTER) %{
2834     equal        (0x0, "eq");
2835     less         (0x1, "lt");
2836     less_equal   (0x2, "le");
2837     not_equal    (0xC, "ne");
2838     greater_equal(0xD, "ge");
2839     greater      (0xE, "gt");
2840     // TODO: adlc fails to compile without the next two lines, reporting the error:
2841     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2842     // equal' for overflow.
2843     overflow     (0x20, "o");  // not really supported by the instruction
2844     no_overflow  (0x21, "no"); // not really supported by the instruction
2845   %}
2846 %}
2847 
2848 
2849 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2850 
2851 // ============================================================================
2852 
2853 instruct ShouldNotReachHere() %{
2854   match(Halt);
2855   format %{ "stop\t# ShouldNotReachHere" %}
2856   ins_encode %{
2857     if (is_reachable()) {
2858       __ stop(_halt_reason);
2859     }
2860   %}
2861   ins_pipe(pipe_slow);
2862 %}
2863 
2864 // =================================EVEX special===============================
2865 // The existing partial implementation for post-loop multi-versioning computes
2866 // the mask corresponding to the tail loop in the K1 opmask register. This may then
2867 // be used for predicating instructions in the loop body during the last post-loop iteration.
2868 // TODO: Remove hard-coded K1 usage while fixing existing post-loop
2869 // multiversioning support.
2870 instruct setMask(rRegI dst, rRegI src, kReg_K1 mask) %{
2871   predicate(PostLoopMultiversioning && Matcher::has_predicated_vectors());
2872   match(Set dst (SetVectMaskI  src));
2873   effect(TEMP dst);
2874   format %{ "setvectmask   $dst, $src" %}
2875   ins_encode %{
2876     __ setvectmask($dst$$Register, $src$$Register, $mask$$KRegister);
2877   %}
2878   ins_pipe(pipe_slow);
2879 %}
2880 
2881 // ============================================================================
2882 
2883 instruct addF_reg(regF dst, regF src) %{
2884   predicate((UseSSE>=1) && (UseAVX == 0));
2885   match(Set dst (AddF dst src));
2886 
2887   format %{ "addss   $dst, $src" %}
2888   ins_cost(150);
2889   ins_encode %{
2890     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2891   %}
2892   ins_pipe(pipe_slow);
2893 %}
2894 
2895 instruct addF_mem(regF dst, memory src) %{
2896   predicate((UseSSE>=1) && (UseAVX == 0));
2897   match(Set dst (AddF dst (LoadF src)));
2898 
2899   format %{ "addss   $dst, $src" %}
2900   ins_cost(150);
2901   ins_encode %{
2902     __ addss($dst$$XMMRegister, $src$$Address);
2903   %}
2904   ins_pipe(pipe_slow);
2905 %}
2906 
2907 instruct addF_imm(regF dst, immF con) %{
2908   predicate((UseSSE>=1) && (UseAVX == 0));
2909   match(Set dst (AddF dst con));
2910   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2911   ins_cost(150);
2912   ins_encode %{
2913     __ addss($dst$$XMMRegister, $constantaddress($con));
2914   %}
2915   ins_pipe(pipe_slow);
2916 %}
2917 
2918 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2919   predicate(UseAVX > 0);
2920   match(Set dst (AddF src1 src2));
2921 
2922   format %{ "vaddss  $dst, $src1, $src2" %}
2923   ins_cost(150);
2924   ins_encode %{
2925     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2926   %}
2927   ins_pipe(pipe_slow);
2928 %}
2929 
2930 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2931   predicate(UseAVX > 0);
2932   match(Set dst (AddF src1 (LoadF src2)));
2933 
2934   format %{ "vaddss  $dst, $src1, $src2" %}
2935   ins_cost(150);
2936   ins_encode %{
2937     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2938   %}
2939   ins_pipe(pipe_slow);
2940 %}
2941 
2942 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2943   predicate(UseAVX > 0);
2944   match(Set dst (AddF src con));
2945 
2946   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2947   ins_cost(150);
2948   ins_encode %{
2949     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2950   %}
2951   ins_pipe(pipe_slow);
2952 %}
2953 
2954 instruct addD_reg(regD dst, regD src) %{
2955   predicate((UseSSE>=2) && (UseAVX == 0));
2956   match(Set dst (AddD dst src));
2957 
2958   format %{ "addsd   $dst, $src" %}
2959   ins_cost(150);
2960   ins_encode %{
2961     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2962   %}
2963   ins_pipe(pipe_slow);
2964 %}
2965 
2966 instruct addD_mem(regD dst, memory src) %{
2967   predicate((UseSSE>=2) && (UseAVX == 0));
2968   match(Set dst (AddD dst (LoadD src)));
2969 
2970   format %{ "addsd   $dst, $src" %}
2971   ins_cost(150);
2972   ins_encode %{
2973     __ addsd($dst$$XMMRegister, $src$$Address);
2974   %}
2975   ins_pipe(pipe_slow);
2976 %}
2977 
2978 instruct addD_imm(regD dst, immD con) %{
2979   predicate((UseSSE>=2) && (UseAVX == 0));
2980   match(Set dst (AddD dst con));
2981   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2982   ins_cost(150);
2983   ins_encode %{
2984     __ addsd($dst$$XMMRegister, $constantaddress($con));
2985   %}
2986   ins_pipe(pipe_slow);
2987 %}
2988 
2989 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2990   predicate(UseAVX > 0);
2991   match(Set dst (AddD src1 src2));
2992 
2993   format %{ "vaddsd  $dst, $src1, $src2" %}
2994   ins_cost(150);
2995   ins_encode %{
2996     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2997   %}
2998   ins_pipe(pipe_slow);
2999 %}
3000 
3001 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
3002   predicate(UseAVX > 0);
3003   match(Set dst (AddD src1 (LoadD src2)));
3004 
3005   format %{ "vaddsd  $dst, $src1, $src2" %}
3006   ins_cost(150);
3007   ins_encode %{
3008     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3009   %}
3010   ins_pipe(pipe_slow);
3011 %}
3012 
3013 instruct addD_reg_imm(regD dst, regD src, immD con) %{
3014   predicate(UseAVX > 0);
3015   match(Set dst (AddD src con));
3016 
3017   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3018   ins_cost(150);
3019   ins_encode %{
3020     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3021   %}
3022   ins_pipe(pipe_slow);
3023 %}
3024 
3025 instruct subF_reg(regF dst, regF src) %{
3026   predicate((UseSSE>=1) && (UseAVX == 0));
3027   match(Set dst (SubF dst src));
3028 
3029   format %{ "subss   $dst, $src" %}
3030   ins_cost(150);
3031   ins_encode %{
3032     __ subss($dst$$XMMRegister, $src$$XMMRegister);
3033   %}
3034   ins_pipe(pipe_slow);
3035 %}
3036 
3037 instruct subF_mem(regF dst, memory src) %{
3038   predicate((UseSSE>=1) && (UseAVX == 0));
3039   match(Set dst (SubF dst (LoadF src)));
3040 
3041   format %{ "subss   $dst, $src" %}
3042   ins_cost(150);
3043   ins_encode %{
3044     __ subss($dst$$XMMRegister, $src$$Address);
3045   %}
3046   ins_pipe(pipe_slow);
3047 %}
3048 
3049 instruct subF_imm(regF dst, immF con) %{
3050   predicate((UseSSE>=1) && (UseAVX == 0));
3051   match(Set dst (SubF dst con));
3052   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3053   ins_cost(150);
3054   ins_encode %{
3055     __ subss($dst$$XMMRegister, $constantaddress($con));
3056   %}
3057   ins_pipe(pipe_slow);
3058 %}
3059 
3060 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
3061   predicate(UseAVX > 0);
3062   match(Set dst (SubF src1 src2));
3063 
3064   format %{ "vsubss  $dst, $src1, $src2" %}
3065   ins_cost(150);
3066   ins_encode %{
3067     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3068   %}
3069   ins_pipe(pipe_slow);
3070 %}
3071 
3072 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
3073   predicate(UseAVX > 0);
3074   match(Set dst (SubF src1 (LoadF src2)));
3075 
3076   format %{ "vsubss  $dst, $src1, $src2" %}
3077   ins_cost(150);
3078   ins_encode %{
3079     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3080   %}
3081   ins_pipe(pipe_slow);
3082 %}
3083 
3084 instruct subF_reg_imm(regF dst, regF src, immF con) %{
3085   predicate(UseAVX > 0);
3086   match(Set dst (SubF src con));
3087 
3088   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3089   ins_cost(150);
3090   ins_encode %{
3091     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3092   %}
3093   ins_pipe(pipe_slow);
3094 %}
3095 
3096 instruct subD_reg(regD dst, regD src) %{
3097   predicate((UseSSE>=2) && (UseAVX == 0));
3098   match(Set dst (SubD dst src));
3099 
3100   format %{ "subsd   $dst, $src" %}
3101   ins_cost(150);
3102   ins_encode %{
3103     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
3104   %}
3105   ins_pipe(pipe_slow);
3106 %}
3107 
3108 instruct subD_mem(regD dst, memory src) %{
3109   predicate((UseSSE>=2) && (UseAVX == 0));
3110   match(Set dst (SubD dst (LoadD src)));
3111 
3112   format %{ "subsd   $dst, $src" %}
3113   ins_cost(150);
3114   ins_encode %{
3115     __ subsd($dst$$XMMRegister, $src$$Address);
3116   %}
3117   ins_pipe(pipe_slow);
3118 %}
3119 
3120 instruct subD_imm(regD dst, immD con) %{
3121   predicate((UseSSE>=2) && (UseAVX == 0));
3122   match(Set dst (SubD dst con));
3123   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3124   ins_cost(150);
3125   ins_encode %{
3126     __ subsd($dst$$XMMRegister, $constantaddress($con));
3127   %}
3128   ins_pipe(pipe_slow);
3129 %}
3130 
3131 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
3132   predicate(UseAVX > 0);
3133   match(Set dst (SubD src1 src2));
3134 
3135   format %{ "vsubsd  $dst, $src1, $src2" %}
3136   ins_cost(150);
3137   ins_encode %{
3138     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3139   %}
3140   ins_pipe(pipe_slow);
3141 %}
3142 
3143 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
3144   predicate(UseAVX > 0);
3145   match(Set dst (SubD src1 (LoadD src2)));
3146 
3147   format %{ "vsubsd  $dst, $src1, $src2" %}
3148   ins_cost(150);
3149   ins_encode %{
3150     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3151   %}
3152   ins_pipe(pipe_slow);
3153 %}
3154 
3155 instruct subD_reg_imm(regD dst, regD src, immD con) %{
3156   predicate(UseAVX > 0);
3157   match(Set dst (SubD src con));
3158 
3159   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3160   ins_cost(150);
3161   ins_encode %{
3162     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3163   %}
3164   ins_pipe(pipe_slow);
3165 %}
3166 
3167 instruct mulF_reg(regF dst, regF src) %{
3168   predicate((UseSSE>=1) && (UseAVX == 0));
3169   match(Set dst (MulF dst src));
3170 
3171   format %{ "mulss   $dst, $src" %}
3172   ins_cost(150);
3173   ins_encode %{
3174     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
3175   %}
3176   ins_pipe(pipe_slow);
3177 %}
3178 
3179 instruct mulF_mem(regF dst, memory src) %{
3180   predicate((UseSSE>=1) && (UseAVX == 0));
3181   match(Set dst (MulF dst (LoadF src)));
3182 
3183   format %{ "mulss   $dst, $src" %}
3184   ins_cost(150);
3185   ins_encode %{
3186     __ mulss($dst$$XMMRegister, $src$$Address);
3187   %}
3188   ins_pipe(pipe_slow);
3189 %}
3190 
3191 instruct mulF_imm(regF dst, immF con) %{
3192   predicate((UseSSE>=1) && (UseAVX == 0));
3193   match(Set dst (MulF dst con));
3194   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3195   ins_cost(150);
3196   ins_encode %{
3197     __ mulss($dst$$XMMRegister, $constantaddress($con));
3198   %}
3199   ins_pipe(pipe_slow);
3200 %}
3201 
3202 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
3203   predicate(UseAVX > 0);
3204   match(Set dst (MulF src1 src2));
3205 
3206   format %{ "vmulss  $dst, $src1, $src2" %}
3207   ins_cost(150);
3208   ins_encode %{
3209     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3210   %}
3211   ins_pipe(pipe_slow);
3212 %}
3213 
3214 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
3215   predicate(UseAVX > 0);
3216   match(Set dst (MulF src1 (LoadF src2)));
3217 
3218   format %{ "vmulss  $dst, $src1, $src2" %}
3219   ins_cost(150);
3220   ins_encode %{
3221     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3222   %}
3223   ins_pipe(pipe_slow);
3224 %}
3225 
3226 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
3227   predicate(UseAVX > 0);
3228   match(Set dst (MulF src con));
3229 
3230   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3231   ins_cost(150);
3232   ins_encode %{
3233     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3234   %}
3235   ins_pipe(pipe_slow);
3236 %}
3237 
3238 instruct mulD_reg(regD dst, regD src) %{
3239   predicate((UseSSE>=2) && (UseAVX == 0));
3240   match(Set dst (MulD dst src));
3241 
3242   format %{ "mulsd   $dst, $src" %}
3243   ins_cost(150);
3244   ins_encode %{
3245     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
3246   %}
3247   ins_pipe(pipe_slow);
3248 %}
3249 
3250 instruct mulD_mem(regD dst, memory src) %{
3251   predicate((UseSSE>=2) && (UseAVX == 0));
3252   match(Set dst (MulD dst (LoadD src)));
3253 
3254   format %{ "mulsd   $dst, $src" %}
3255   ins_cost(150);
3256   ins_encode %{
3257     __ mulsd($dst$$XMMRegister, $src$$Address);
3258   %}
3259   ins_pipe(pipe_slow);
3260 %}
3261 
3262 instruct mulD_imm(regD dst, immD con) %{
3263   predicate((UseSSE>=2) && (UseAVX == 0));
3264   match(Set dst (MulD dst con));
3265   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3266   ins_cost(150);
3267   ins_encode %{
3268     __ mulsd($dst$$XMMRegister, $constantaddress($con));
3269   %}
3270   ins_pipe(pipe_slow);
3271 %}
3272 
3273 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
3274   predicate(UseAVX > 0);
3275   match(Set dst (MulD src1 src2));
3276 
3277   format %{ "vmulsd  $dst, $src1, $src2" %}
3278   ins_cost(150);
3279   ins_encode %{
3280     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3281   %}
3282   ins_pipe(pipe_slow);
3283 %}
3284 
3285 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
3286   predicate(UseAVX > 0);
3287   match(Set dst (MulD src1 (LoadD src2)));
3288 
3289   format %{ "vmulsd  $dst, $src1, $src2" %}
3290   ins_cost(150);
3291   ins_encode %{
3292     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3293   %}
3294   ins_pipe(pipe_slow);
3295 %}
3296 
3297 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
3298   predicate(UseAVX > 0);
3299   match(Set dst (MulD src con));
3300 
3301   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3302   ins_cost(150);
3303   ins_encode %{
3304     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3305   %}
3306   ins_pipe(pipe_slow);
3307 %}
3308 
3309 instruct divF_reg(regF dst, regF src) %{
3310   predicate((UseSSE>=1) && (UseAVX == 0));
3311   match(Set dst (DivF dst src));
3312 
3313   format %{ "divss   $dst, $src" %}
3314   ins_cost(150);
3315   ins_encode %{
3316     __ divss($dst$$XMMRegister, $src$$XMMRegister);
3317   %}
3318   ins_pipe(pipe_slow);
3319 %}
3320 
3321 instruct divF_mem(regF dst, memory src) %{
3322   predicate((UseSSE>=1) && (UseAVX == 0));
3323   match(Set dst (DivF dst (LoadF src)));
3324 
3325   format %{ "divss   $dst, $src" %}
3326   ins_cost(150);
3327   ins_encode %{
3328     __ divss($dst$$XMMRegister, $src$$Address);
3329   %}
3330   ins_pipe(pipe_slow);
3331 %}
3332 
3333 instruct divF_imm(regF dst, immF con) %{
3334   predicate((UseSSE>=1) && (UseAVX == 0));
3335   match(Set dst (DivF dst con));
3336   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3337   ins_cost(150);
3338   ins_encode %{
3339     __ divss($dst$$XMMRegister, $constantaddress($con));
3340   %}
3341   ins_pipe(pipe_slow);
3342 %}
3343 
3344 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
3345   predicate(UseAVX > 0);
3346   match(Set dst (DivF src1 src2));
3347 
3348   format %{ "vdivss  $dst, $src1, $src2" %}
3349   ins_cost(150);
3350   ins_encode %{
3351     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3352   %}
3353   ins_pipe(pipe_slow);
3354 %}
3355 
3356 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
3357   predicate(UseAVX > 0);
3358   match(Set dst (DivF src1 (LoadF src2)));
3359 
3360   format %{ "vdivss  $dst, $src1, $src2" %}
3361   ins_cost(150);
3362   ins_encode %{
3363     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3364   %}
3365   ins_pipe(pipe_slow);
3366 %}
3367 
3368 instruct divF_reg_imm(regF dst, regF src, immF con) %{
3369   predicate(UseAVX > 0);
3370   match(Set dst (DivF src con));
3371 
3372   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3373   ins_cost(150);
3374   ins_encode %{
3375     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3376   %}
3377   ins_pipe(pipe_slow);
3378 %}
3379 
3380 instruct divD_reg(regD dst, regD src) %{
3381   predicate((UseSSE>=2) && (UseAVX == 0));
3382   match(Set dst (DivD dst src));
3383 
3384   format %{ "divsd   $dst, $src" %}
3385   ins_cost(150);
3386   ins_encode %{
3387     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
3388   %}
3389   ins_pipe(pipe_slow);
3390 %}
3391 
3392 instruct divD_mem(regD dst, memory src) %{
3393   predicate((UseSSE>=2) && (UseAVX == 0));
3394   match(Set dst (DivD dst (LoadD src)));
3395 
3396   format %{ "divsd   $dst, $src" %}
3397   ins_cost(150);
3398   ins_encode %{
3399     __ divsd($dst$$XMMRegister, $src$$Address);
3400   %}
3401   ins_pipe(pipe_slow);
3402 %}
3403 
3404 instruct divD_imm(regD dst, immD con) %{
3405   predicate((UseSSE>=2) && (UseAVX == 0));
3406   match(Set dst (DivD dst con));
3407   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3408   ins_cost(150);
3409   ins_encode %{
3410     __ divsd($dst$$XMMRegister, $constantaddress($con));
3411   %}
3412   ins_pipe(pipe_slow);
3413 %}
3414 
3415 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
3416   predicate(UseAVX > 0);
3417   match(Set dst (DivD src1 src2));
3418 
3419   format %{ "vdivsd  $dst, $src1, $src2" %}
3420   ins_cost(150);
3421   ins_encode %{
3422     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3423   %}
3424   ins_pipe(pipe_slow);
3425 %}
3426 
3427 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
3428   predicate(UseAVX > 0);
3429   match(Set dst (DivD src1 (LoadD src2)));
3430 
3431   format %{ "vdivsd  $dst, $src1, $src2" %}
3432   ins_cost(150);
3433   ins_encode %{
3434     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3435   %}
3436   ins_pipe(pipe_slow);
3437 %}
3438 
3439 instruct divD_reg_imm(regD dst, regD src, immD con) %{
3440   predicate(UseAVX > 0);
3441   match(Set dst (DivD src con));
3442 
3443   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3444   ins_cost(150);
3445   ins_encode %{
3446     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3447   %}
3448   ins_pipe(pipe_slow);
3449 %}
3450 
3451 instruct absF_reg(regF dst) %{
3452   predicate((UseSSE>=1) && (UseAVX == 0));
3453   match(Set dst (AbsF dst));
3454   ins_cost(150);
3455   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
3456   ins_encode %{
3457     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
3458   %}
3459   ins_pipe(pipe_slow);
3460 %}
3461 
3462 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
3463   predicate(UseAVX > 0);
3464   match(Set dst (AbsF src));
3465   ins_cost(150);
3466   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
3467   ins_encode %{
3468     int vlen_enc = Assembler::AVX_128bit;
3469     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
3470               ExternalAddress(float_signmask()), vlen_enc);
3471   %}
3472   ins_pipe(pipe_slow);
3473 %}
3474 
3475 instruct absD_reg(regD dst) %{
3476   predicate((UseSSE>=2) && (UseAVX == 0));
3477   match(Set dst (AbsD dst));
3478   ins_cost(150);
3479   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
3480             "# abs double by sign masking" %}
3481   ins_encode %{
3482     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
3483   %}
3484   ins_pipe(pipe_slow);
3485 %}
3486 
3487 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
3488   predicate(UseAVX > 0);
3489   match(Set dst (AbsD src));
3490   ins_cost(150);
3491   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
3492             "# abs double by sign masking" %}
3493   ins_encode %{
3494     int vlen_enc = Assembler::AVX_128bit;
3495     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
3496               ExternalAddress(double_signmask()), vlen_enc);
3497   %}
3498   ins_pipe(pipe_slow);
3499 %}
3500 
3501 instruct negF_reg(regF dst) %{
3502   predicate((UseSSE>=1) && (UseAVX == 0));
3503   match(Set dst (NegF dst));
3504   ins_cost(150);
3505   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
3506   ins_encode %{
3507     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
3508   %}
3509   ins_pipe(pipe_slow);
3510 %}
3511 
3512 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
3513   predicate(UseAVX > 0);
3514   match(Set dst (NegF src));
3515   ins_cost(150);
3516   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
3517   ins_encode %{
3518     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
3519                  ExternalAddress(float_signflip()));
3520   %}
3521   ins_pipe(pipe_slow);
3522 %}
3523 
3524 instruct negD_reg(regD dst) %{
3525   predicate((UseSSE>=2) && (UseAVX == 0));
3526   match(Set dst (NegD dst));
3527   ins_cost(150);
3528   format %{ "xorpd   $dst, [0x8000000000000000]\t"
3529             "# neg double by sign flipping" %}
3530   ins_encode %{
3531     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
3532   %}
3533   ins_pipe(pipe_slow);
3534 %}
3535 
3536 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
3537   predicate(UseAVX > 0);
3538   match(Set dst (NegD src));
3539   ins_cost(150);
3540   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
3541             "# neg double by sign flipping" %}
3542   ins_encode %{
3543     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
3544                  ExternalAddress(double_signflip()));
3545   %}
3546   ins_pipe(pipe_slow);
3547 %}
3548 
3549 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
3550 // Therefore, only the instruct rule where the input is pre-loaded into the dst register is defined below.
3551 instruct sqrtF_reg(regF dst) %{
3552   predicate(UseSSE>=1);
3553   match(Set dst (SqrtF dst));
3554   format %{ "sqrtss  $dst, $dst" %}
3555   ins_encode %{
3556     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
3557   %}
3558   ins_pipe(pipe_slow);
3559 %}
3560 
3561 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
3562 // Therefore, only the instruct rule where the input is pre-loaded into the dst register is defined below.
3563 instruct sqrtD_reg(regD dst) %{
3564   predicate(UseSSE>=2);
3565   match(Set dst (SqrtD dst));
3566   format %{ "sqrtsd  $dst, $dst" %}
3567   ins_encode %{
3568     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
3569   %}
3570   ins_pipe(pipe_slow);
3571 %}
3572 
3573 
3574 // ---------------------------------------- VectorReinterpret ------------------------------------
3575 instruct reinterpret_mask(kReg dst) %{
3576   predicate(n->bottom_type()->isa_vectmask() &&
3577             Matcher::vector_length(n) == Matcher::vector_length(n->in(1))); // dst == src
3578   match(Set dst (VectorReinterpret dst));
3579   ins_cost(125);
3580   format %{ "vector_reinterpret $dst\t!" %}
3581   ins_encode %{
3582     // empty
3583   %}
3584   ins_pipe( pipe_slow );
3585 %}
3586 
3587 instruct reinterpret_mask_W2B(kReg dst, kReg src, vec xtmp) %{
3588   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3589             n->bottom_type()->isa_vectmask() &&
3590             n->in(1)->bottom_type()->isa_vectmask() &&
3591             n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_SHORT &&
3592             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
3593   match(Set dst (VectorReinterpret src));
3594   effect(TEMP xtmp);
3595   format %{ "vector_mask_reinterpret_W2B $dst $src\t!" %}
3596   ins_encode %{
3597      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_SHORT);
3598      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3599      assert(src_sz == dst_sz, "src and dst size mismatch");
3600      int vlen_enc = vector_length_encoding(src_sz);
3601      __  evpmovm2w($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3602      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3603   %}
3604   ins_pipe( pipe_slow );
3605 %}
3606 
3607 instruct reinterpret_mask_D2B(kReg dst, kReg src, vec xtmp) %{
3608   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3609             n->bottom_type()->isa_vectmask() &&
3610             n->in(1)->bottom_type()->isa_vectmask() &&
3611             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_INT ||
3612              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_FLOAT) &&
3613             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
3614   match(Set dst (VectorReinterpret src));
3615   effect(TEMP xtmp);
3616   format %{ "vector_mask_reinterpret_D2B $dst $src\t!" %}
3617   ins_encode %{
3618      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_INT);
3619      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3620      assert(src_sz == dst_sz, "src and dst size mismatch");
3621      int vlen_enc = vector_length_encoding(src_sz);
3622      __  evpmovm2d($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3623      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3624   %}
3625   ins_pipe( pipe_slow );
3626 %}
3627 
3628 instruct reinterpret_mask_Q2B(kReg dst, kReg src, vec xtmp) %{
3629   predicate(UseAVX > 2 && Matcher::vector_length(n) != Matcher::vector_length(n->in(1)) &&
3630             n->bottom_type()->isa_vectmask() &&
3631             n->in(1)->bottom_type()->isa_vectmask() &&
3632             (n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_LONG ||
3633              n->in(1)->bottom_type()->is_vectmask()->element_basic_type() == T_DOUBLE) &&
3634             n->bottom_type()->is_vectmask()->element_basic_type() == T_BYTE); // dst == src
3635   match(Set dst (VectorReinterpret src));
3636   effect(TEMP xtmp);
3637   format %{ "vector_mask_reinterpret_Q2B $dst $src\t!" %}
3638   ins_encode %{
3639      int src_sz = Matcher::vector_length(this, $src)*type2aelembytes(T_LONG);
3640      int dst_sz = Matcher::vector_length(this)*type2aelembytes(T_BYTE);
3641      assert(src_sz == dst_sz, "src and dst size mismatch");
3642      int vlen_enc = vector_length_encoding(src_sz);
3643      __  evpmovm2q($xtmp$$XMMRegister, $src$$KRegister, vlen_enc);
3644      __  evpmovb2m($dst$$KRegister, $xtmp$$XMMRegister, vlen_enc);
3645   %}
3646   ins_pipe( pipe_slow );
3647 %}
3648 
3649 instruct reinterpret(vec dst) %{
3650   predicate(!n->bottom_type()->isa_vectmask() &&
3651             Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
3652   match(Set dst (VectorReinterpret dst));
3653   ins_cost(125);
3654   format %{ "vector_reinterpret $dst\t!" %}
3655   ins_encode %{
3656     // empty
3657   %}
3658   ins_pipe( pipe_slow );
3659 %}
3660 
3661 instruct reinterpret_expand(vec dst, vec src, rRegP scratch) %{
3662   predicate(UseAVX == 0 &&
3663             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3664   match(Set dst (VectorReinterpret src));
3665   ins_cost(125);
3666   effect(TEMP dst, TEMP scratch);
3667   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3668   ins_encode %{
3669     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
3670     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
3671 
3672     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
3673     if (src_vlen_in_bytes == 4) {
3674       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3675     } else {
3676       assert(src_vlen_in_bytes == 8, "");
3677       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3678     }
3679     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3680   %}
3681   ins_pipe( pipe_slow );
3682 %}
3683 
3684 instruct vreinterpret_expand4(legVec dst, vec src, rRegP scratch) %{
3685   predicate(UseAVX > 0 &&
3686             !n->bottom_type()->isa_vectmask() &&
3687             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
3688             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3689   match(Set dst (VectorReinterpret src));
3690   ins_cost(125);
3691   effect(TEMP scratch);
3692   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3693   ins_encode %{
3694     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, $scratch$$Register);
3695   %}
3696   ins_pipe( pipe_slow );
3697 %}
3698 
3699 
3700 instruct vreinterpret_expand(legVec dst, vec src) %{
3701   predicate(UseAVX > 0 &&
3702             !n->bottom_type()->isa_vectmask() &&
3703             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
3704             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3705   match(Set dst (VectorReinterpret src));
3706   ins_cost(125);
3707   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
3708   ins_encode %{
3709     switch (Matcher::vector_length_in_bytes(this, $src)) {
3710       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3711       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3712       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3713       default: ShouldNotReachHere();
3714     }
3715   %}
3716   ins_pipe( pipe_slow );
3717 %}
3718 
3719 instruct reinterpret_shrink(vec dst, legVec src) %{
3720   predicate(!n->bottom_type()->isa_vectmask() &&
3721             Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
3722   match(Set dst (VectorReinterpret src));
3723   ins_cost(125);
3724   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
3725   ins_encode %{
3726     switch (Matcher::vector_length_in_bytes(this)) {
3727       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
3728       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3729       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3730       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3731       default: ShouldNotReachHere();
3732     }
3733   %}
3734   ins_pipe( pipe_slow );
3735 %}
3736 
3737 // ----------------------------------------------------------------------------------------------------
3738 
3739 #ifdef _LP64
3740 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
3741   match(Set dst (RoundDoubleMode src rmode));
3742   format %{ "roundsd $dst,$src" %}
3743   ins_cost(150);
3744   ins_encode %{
3745     assert(UseSSE >= 4, "required");
3746     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
3747   %}
3748   ins_pipe(pipe_slow);
3749 %}
3750 
3751 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
3752   match(Set dst (RoundDoubleMode (LoadD src) rmode));
3753   format %{ "roundsd $dst,$src" %}
3754   ins_cost(150);
3755   ins_encode %{
3756     assert(UseSSE >= 4, "required");
3757     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
3758   %}
3759   ins_pipe(pipe_slow);
3760 %}
3761 
3762 instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
3763   match(Set dst (RoundDoubleMode con rmode));
3764   effect(TEMP scratch_reg);
3765   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
3766   ins_cost(150);
3767   ins_encode %{
3768     assert(UseSSE >= 4, "required");
3769     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
3770   %}
3771   ins_pipe(pipe_slow);
3772 %}
3773 
3774 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
3775   predicate(Matcher::vector_length(n) < 8);
3776   match(Set dst (RoundDoubleModeV src rmode));
3777   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
3778   ins_encode %{
3779     assert(UseAVX > 0, "required");
3780     int vlen_enc = vector_length_encoding(this);
3781     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
3782   %}
3783   ins_pipe( pipe_slow );
3784 %}
3785 
3786 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
3787   predicate(Matcher::vector_length(n) == 8);
3788   match(Set dst (RoundDoubleModeV src rmode));
3789   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
3790   ins_encode %{
3791     assert(UseAVX > 2, "required");
3792     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
3793   %}
3794   ins_pipe( pipe_slow );
3795 %}
3796 
3797 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3798   predicate(Matcher::vector_length(n) < 8);
3799   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3800   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3801   ins_encode %{
3802     assert(UseAVX > 0, "required");
3803     int vlen_enc = vector_length_encoding(this);
3804     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
3805   %}
3806   ins_pipe( pipe_slow );
3807 %}
3808 
3809 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3810   predicate(Matcher::vector_length(n) == 8);
3811   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3812   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3813   ins_encode %{
3814     assert(UseAVX > 2, "required");
3815     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3816   %}
3817   ins_pipe( pipe_slow );
3818 %}
3819 #endif // _LP64
3820 
3821 instruct onspinwait() %{
3822   match(OnSpinWait);
3823   ins_cost(200);
3824 
3825   format %{
3826     $$template
3827     $$emit$$"pause\t! membar_onspinwait"
3828   %}
3829   ins_encode %{
3830     __ pause();
3831   %}
3832   ins_pipe(pipe_slow);
3833 %}
3834 
3835 // a * b + c
3836 instruct fmaD_reg(regD a, regD b, regD c) %{
3837   predicate(UseFMA);
3838   match(Set c (FmaD  c (Binary a b)));
3839   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3840   ins_cost(150);
3841   ins_encode %{
3842     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3843   %}
3844   ins_pipe( pipe_slow );
3845 %}
3846 
3847 // a * b + c
3848 instruct fmaF_reg(regF a, regF b, regF c) %{
3849   predicate(UseFMA);
3850   match(Set c (FmaF  c (Binary a b)));
3851   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3852   ins_cost(150);
3853   ins_encode %{
3854     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3855   %}
3856   ins_pipe( pipe_slow );
3857 %}
3858 
3859 // ====================VECTOR INSTRUCTIONS=====================================
3860 
3861 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
3862 instruct MoveVec2Leg(legVec dst, vec src) %{
3863   match(Set dst src);
3864   format %{ "" %}
3865   ins_encode %{
3866     ShouldNotReachHere();
3867   %}
3868   ins_pipe( fpu_reg_reg );
3869 %}
3870 
3871 instruct MoveLeg2Vec(vec dst, legVec src) %{
3872   match(Set dst src);
3873   format %{ "" %}
3874   ins_encode %{
3875     ShouldNotReachHere();
3876   %}
3877   ins_pipe( fpu_reg_reg );
3878 %}
3879 
3880 // ============================================================================
3881 
3882 // Load vectors generic operand pattern
3883 instruct loadV(vec dst, memory mem) %{
3884   match(Set dst (LoadVector mem));
3885   ins_cost(125);
3886   format %{ "load_vector $dst,$mem" %}
3887   ins_encode %{
3888     __ load_vector($dst$$XMMRegister, $mem$$Address, Matcher::vector_length_in_bytes(this));
3889   %}
3890   ins_pipe( pipe_slow );
3891 %}
3892 
3893 // Store vectors generic operand pattern.
3894 instruct storeV(memory mem, vec src) %{
3895   match(Set mem (StoreVector mem src));
3896   ins_cost(145);
3897   format %{ "store_vector $mem,$src\n\t" %}
3898   ins_encode %{
3899     switch (Matcher::vector_length_in_bytes(this, $src)) {
3900       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
3901       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
3902       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
3903       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
3904       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
3905       default: ShouldNotReachHere();
3906     }
3907   %}
3908   ins_pipe( pipe_slow );
3909 %}
3910 
3911 // ---------------------------------------- Gather ------------------------------------
3912 
3913 // Gather INT, LONG, FLOAT, DOUBLE
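//
// Conceptual semantics (illustration only, not emitted code): every lane of the
// destination is loaded through the matching lane of the index vector,
//
//   for (int i = 0; i < vlen; i++) {
//     dst[i] = base[idx[i]];     // masked forms load only lanes whose mask bit is set
//   }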
3914 
3915 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
3916   predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
3917   match(Set dst (LoadVectorGather mem idx));
3918   effect(TEMP dst, TEMP tmp, TEMP mask);
3919   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
3920   ins_encode %{
3921     assert(UseAVX >= 2, "sanity");
3922 
3923     int vlen_enc = vector_length_encoding(this);
3924     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3925 
3926     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
3927     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3928 
3929     if (vlen_enc == Assembler::AVX_128bit) {
3930       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3931     } else {
3932       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3933     }
3934     __ lea($tmp$$Register, $mem$$Address);
3935     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
3936   %}
3937   ins_pipe( pipe_slow );
3938 %}
3939 
3940 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
3941   predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
3942   match(Set dst (LoadVectorGather mem idx));
3943   effect(TEMP dst, TEMP tmp, TEMP ktmp);
3944   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and ktmp as TEMP" %}
3945   ins_encode %{
3946     assert(UseAVX > 2, "sanity");
3947 
3948     int vlen_enc = vector_length_encoding(this);
3949     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3950 
3951     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3952 
3953     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3954     __ lea($tmp$$Register, $mem$$Address);
3955     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3956   %}
3957   ins_pipe( pipe_slow );
3958 %}
3959 
3960 instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
3961   match(Set dst (LoadVectorGatherMasked mem (Binary idx mask)));
3962   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp);
3963   format %{ "load_vector_gather_masked $dst, $mem, $idx, $mask\t! using $tmp and ktmp as TEMP" %}
3964   ins_encode %{
3965     assert(UseAVX > 2, "sanity");
3966     int vlen_enc = vector_length_encoding(this);
3967     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3968     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the gather instruction partially updates the opmask register used
    // for predication, copy the mask operand into a temporary before use.
3971     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
3972     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3973     __ lea($tmp$$Register, $mem$$Address);
3974     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3975   %}
3976   ins_pipe( pipe_slow );
3977 %}
3978 // ====================Scatter=======================================
3979 
3980 // Scatter INT, LONG, FLOAT, DOUBLE
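//
// Conceptual semantics (illustration only, not emitted code):
//
//   for (int i = 0; i < vlen; i++) {
//     base[idx[i]] = src[i];     // masked form stores only lanes whose mask bit is set
//   }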
3981 
3982 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
3983   predicate(UseAVX > 2);
3984   match(Set mem (StoreVectorScatter mem (Binary src idx)));
3985   effect(TEMP tmp, TEMP ktmp);
3986   format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %}
3987   ins_encode %{
3988     int vlen_enc = vector_length_encoding(this, $src);
3989     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
3990 
3991     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
3992     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3993 
3994     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3995     __ lea($tmp$$Register, $mem$$Address);
3996     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
3997   %}
3998   ins_pipe( pipe_slow );
3999 %}
4000 
4001 instruct scatter_masked(memory mem, vec src, vec idx, kReg mask, kReg ktmp, rRegP tmp) %{
4002   match(Set mem (StoreVectorScatterMasked mem (Binary src (Binary idx mask))));
4003   effect(TEMP tmp, TEMP ktmp);
4004   format %{ "store_vector_scatter_masked $mem, $idx, $src, $mask\t!" %}
4005   ins_encode %{
4006     int vlen_enc = vector_length_encoding(this, $src);
4007     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
4008     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
4009     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
    // Note: Since the scatter instruction partially updates the opmask register used
    // for predication, copy the mask operand into a temporary before use.
4012     __ kmovwl($ktmp$$KRegister, $mask$$KRegister);
4013     __ lea($tmp$$Register, $mem$$Address);
4014     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
4015   %}
4016   ins_pipe( pipe_slow );
4017 %}
4018 
4019 // ====================REPLICATE=======================================
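//
// Conceptual semantics of the Repl* rules below (illustration only, not emitted
// code): broadcast one scalar (register, memory or immediate) into every lane,
//
//   for (int i = 0; i < vlen; i++) {
//     dst[i] = src;
//   }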
4020 
4021 // Replicate byte scalar to be vector
4022 instruct ReplB_reg(vec dst, rRegI src) %{
4023   match(Set dst (ReplicateB src));
4024   format %{ "replicateB $dst,$src" %}
4025   ins_encode %{
4026     uint vlen = Matcher::vector_length(this);
4027     if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
4028       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
4029       int vlen_enc = vector_length_encoding(this);
4030       __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
4031     } else if (VM_Version::supports_avx2()) {
4032       int vlen_enc = vector_length_encoding(this);
4033       __ movdl($dst$$XMMRegister, $src$$Register);
4034       __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4035     } else {
4036       __ movdl($dst$$XMMRegister, $src$$Register);
4037       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
4038       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4039       if (vlen >= 16) {
4040         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4041         if (vlen >= 32) {
4042           assert(vlen == 32, "sanity");
4043           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4044         }
4045       }
4046     }
4047   %}
4048   ins_pipe( pipe_slow );
4049 %}
4050 
4051 instruct ReplB_mem(vec dst, memory mem) %{
4052   predicate(VM_Version::supports_avx2());
4053   match(Set dst (ReplicateB (LoadB mem)));
4054   format %{ "replicateB $dst,$mem" %}
4055   ins_encode %{
4056     int vlen_enc = vector_length_encoding(this);
4057     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
4058   %}
4059   ins_pipe( pipe_slow );
4060 %}
4061 
4062 instruct ReplB_imm(vec dst, immI con) %{
4063   match(Set dst (ReplicateB con));
4064   format %{ "replicateB $dst,$con" %}
4065   ins_encode %{
4066     InternalAddress addr = $constantaddress(T_BYTE, vreplicate_imm(T_BYTE, $con$$constant, Matcher::vector_length(this)));
4067     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
4068   %}
4069   ins_pipe( pipe_slow );
4070 %}
4071 
4072 // ====================ReplicateS=======================================
4073 
4074 instruct ReplS_reg(vec dst, rRegI src) %{
4075   match(Set dst (ReplicateS src));
4076   format %{ "replicateS $dst,$src" %}
4077   ins_encode %{
4078     uint vlen = Matcher::vector_length(this);
4079     if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
4080       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
4081       int vlen_enc = vector_length_encoding(this);
4082       __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
4083     } else if (VM_Version::supports_avx2()) {
4084       int vlen_enc = vector_length_encoding(this);
4085       __ movdl($dst$$XMMRegister, $src$$Register);
4086       __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4087     } else {
4088       __ movdl($dst$$XMMRegister, $src$$Register);
4089       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4090       if (vlen >= 8) {
4091         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4092         if (vlen >= 16) {
4093           assert(vlen == 16, "sanity");
4094           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4095         }
4096       }
4097     }
4098   %}
4099   ins_pipe( pipe_slow );
4100 %}
4101 
4102 instruct ReplS_mem(vec dst, memory mem) %{
4103   predicate(VM_Version::supports_avx2());
4104   match(Set dst (ReplicateS (LoadS mem)));
4105   format %{ "replicateS $dst,$mem" %}
4106   ins_encode %{
4107     int vlen_enc = vector_length_encoding(this);
4108     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
4109   %}
4110   ins_pipe( pipe_slow );
4111 %}
4112 
4113 instruct ReplS_imm(vec dst, immI con) %{
4114   match(Set dst (ReplicateS con));
4115   format %{ "replicateS $dst,$con" %}
4116   ins_encode %{
4117     InternalAddress addr = $constantaddress(T_SHORT, vreplicate_imm(T_SHORT, $con$$constant, Matcher::vector_length(this)));
4118     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
4119   %}
4120   ins_pipe( pipe_slow );
4121 %}
4122 
4123 // ====================ReplicateI=======================================
4124 
4125 instruct ReplI_reg(vec dst, rRegI src) %{
4126   match(Set dst (ReplicateI src));
4127   format %{ "replicateI $dst,$src" %}
4128   ins_encode %{
4129     uint vlen = Matcher::vector_length(this);
4130     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4131       int vlen_enc = vector_length_encoding(this);
4132       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
4133     } else if (VM_Version::supports_avx2()) {
4134       int vlen_enc = vector_length_encoding(this);
4135       __ movdl($dst$$XMMRegister, $src$$Register);
4136       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4137     } else {
4138       __ movdl($dst$$XMMRegister, $src$$Register);
4139       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4140       if (vlen >= 8) {
4141         assert(vlen == 8, "sanity");
4142         __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4143       }
4144     }
4145   %}
4146   ins_pipe( pipe_slow );
4147 %}
4148 
4149 instruct ReplI_mem(vec dst, memory mem) %{
4150   match(Set dst (ReplicateI (LoadI mem)));
4151   format %{ "replicateI $dst,$mem" %}
4152   ins_encode %{
4153     uint vlen = Matcher::vector_length(this);
4154     if (vlen <= 4) {
4155       __ movdl($dst$$XMMRegister, $mem$$Address);
4156       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4157     } else {
4158       assert(VM_Version::supports_avx2(), "sanity");
4159       int vlen_enc = vector_length_encoding(this);
4160       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4161     }
4162   %}
4163   ins_pipe( pipe_slow );
4164 %}
4165 
4166 instruct ReplI_imm(vec dst, immI con) %{
4167   match(Set dst (ReplicateI con));
4168   format %{ "replicateI $dst,$con" %}
4169   ins_encode %{
4170     InternalAddress addr = $constantaddress(T_INT, vreplicate_imm(T_INT, $con$$constant, Matcher::vector_length(this)));
4171     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
4172   %}
4173   ins_pipe( pipe_slow );
4174 %}
4175 
4176 // Replicate scalar zero to be vector
4177 instruct ReplI_zero(vec dst, immI_0 zero) %{
4178   match(Set dst (ReplicateB zero));
4179   match(Set dst (ReplicateS zero));
4180   match(Set dst (ReplicateI zero));
4181   format %{ "replicateI $dst,$zero" %}
4182   ins_encode %{
4183     uint vsize = Matcher::vector_length_in_bytes(this);
4184     if (vsize <= 16) {
4185       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4186     } else {
4187       int vlen_enc = vector_length_encoding(this);
4188       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4189     }
4190   %}
4191   ins_pipe( fpu_reg_reg );
4192 %}
4193 
4194 instruct ReplI_M1(vec dst, immI_M1 con) %{
4195   predicate(UseAVX > 0 && Matcher::vector_length_in_bytes(n) >= 16);
4196   match(Set dst (ReplicateB con));
4197   match(Set dst (ReplicateS con));
4198   match(Set dst (ReplicateI con));
4199   effect(TEMP dst);
4200   format %{ "vallones $dst" %}
4201   ins_encode %{
4202     int vector_len = vector_length_encoding(this);
4203     __ vallones($dst$$XMMRegister, vector_len);
4204   %}
4205   ins_pipe( pipe_slow );
4206 %}
4207 
4208 // ====================ReplicateL=======================================
4209 
4210 #ifdef _LP64
4211 // Replicate long (8 byte) scalar to be vector
4212 instruct ReplL_reg(vec dst, rRegL src) %{
4213   match(Set dst (ReplicateL src));
4214   format %{ "replicateL $dst,$src" %}
4215   ins_encode %{
4216     uint vlen = Matcher::vector_length(this);
4217     if (vlen == 2) {
4218       __ movdq($dst$$XMMRegister, $src$$Register);
4219       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4220     } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4221       int vlen_enc = vector_length_encoding(this);
4222       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
4223     } else if (VM_Version::supports_avx2()) {
4224       assert(vlen == 4, "sanity");
4225       int vlen_enc = vector_length_encoding(this);
4226       __ movdq($dst$$XMMRegister, $src$$Register);
4227       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4228     } else {
4229       assert(vlen == 4, "sanity");
4230       __ movdq($dst$$XMMRegister, $src$$Register);
4231       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4232       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4233     }
4234   %}
4235   ins_pipe( pipe_slow );
4236 %}
4237 #else // _LP64
4238 // Replicate long (8 byte) scalar to be vector
4239 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
4240   predicate(Matcher::vector_length(n) <= 4);
4241   match(Set dst (ReplicateL src));
4242   effect(TEMP dst, USE src, TEMP tmp);
4243   format %{ "replicateL $dst,$src" %}
4244   ins_encode %{
4245     uint vlen = Matcher::vector_length(this);
4246     if (vlen == 2) {
4247       __ movdl($dst$$XMMRegister, $src$$Register);
4248       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4249       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4250       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4251     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
4252       int vlen_enc = Assembler::AVX_256bit;
4253       __ movdl($dst$$XMMRegister, $src$$Register);
4254       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4255       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4256       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4257     } else {
4258       __ movdl($dst$$XMMRegister, $src$$Register);
4259       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4260       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4261       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4262       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4263     }
4264   %}
4265   ins_pipe( pipe_slow );
4266 %}
4267 
4268 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
4269   predicate(Matcher::vector_length(n) == 8);
4270   match(Set dst (ReplicateL src));
4271   effect(TEMP dst, USE src, TEMP tmp);
4272   format %{ "replicateL $dst,$src" %}
4273   ins_encode %{
4274     if (VM_Version::supports_avx512vl()) {
4275       __ movdl($dst$$XMMRegister, $src$$Register);
4276       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4277       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4278       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4279       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
4280       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
4281     } else {
4282       int vlen_enc = Assembler::AVX_512bit;
4283       __ movdl($dst$$XMMRegister, $src$$Register);
4284       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4285       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4286       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4287     }
4288   %}
4289   ins_pipe( pipe_slow );
4290 %}
4291 #endif // _LP64
4292 
4293 instruct ReplL_mem(vec dst, memory mem) %{
4294   match(Set dst (ReplicateL (LoadL mem)));
4295   format %{ "replicateL $dst,$mem" %}
4296   ins_encode %{
4297     uint vlen = Matcher::vector_length(this);
4298     if (vlen == 2) {
4299       __ movq($dst$$XMMRegister, $mem$$Address);
4300       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4301     } else {
4302       assert(VM_Version::supports_avx2(), "sanity");
4303       int vlen_enc = vector_length_encoding(this);
4304       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
4305     }
4306   %}
4307   ins_pipe( pipe_slow );
4308 %}
4309 
4310 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4311 instruct ReplL_imm(vec dst, immL con) %{
4312   match(Set dst (ReplicateL con));
4313   format %{ "replicateL $dst,$con" %}
4314   ins_encode %{
4315     InternalAddress addr = $constantaddress(T_LONG, vreplicate_imm(T_LONG, $con$$constant, Matcher::vector_length(this)));
4316     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
4317   %}
4318   ins_pipe( pipe_slow );
4319 %}
4320 
4321 instruct ReplL_zero(vec dst, immL0 zero) %{
4322   match(Set dst (ReplicateL zero));
4323   format %{ "replicateL $dst,$zero" %}
4324   ins_encode %{
4325     int vlen = Matcher::vector_length(this);
4326     if (vlen == 2) {
4327       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4328     } else {
4329       int vlen_enc = vector_length_encoding(this);
4330       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4331     }
4332   %}
4333   ins_pipe( fpu_reg_reg );
4334 %}
4335 
4336 instruct ReplL_M1(vec dst, immL_M1 con) %{
4337   predicate(UseAVX > 0);
4338   match(Set dst (ReplicateL con));
4339   effect(TEMP dst);
4340   format %{ "vallones $dst" %}
4341   ins_encode %{
4342     int vector_len = vector_length_encoding(this);
4343     __ vallones($dst$$XMMRegister, vector_len);
4344   %}
4345   ins_pipe( pipe_slow );
4346 %}
4347 
4348 // ====================ReplicateF=======================================
4349 
4350 instruct ReplF_reg(vec dst, vlRegF src) %{
4351   match(Set dst (ReplicateF src));
4352   format %{ "replicateF $dst,$src" %}
4353   ins_encode %{
4354     uint vlen = Matcher::vector_length(this);
4355     if (vlen <= 4) {
4356       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
    } else if (VM_Version::supports_avx2()) {
4358       int vlen_enc = vector_length_encoding(this);
4359       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4360     } else {
4361       assert(vlen == 8, "sanity");
4362       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4363       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4364     }
4365   %}
4366   ins_pipe( pipe_slow );
4367 %}
4368 
4369 instruct ReplF_mem(vec dst, memory mem) %{
4370   match(Set dst (ReplicateF (LoadF mem)));
4371   format %{ "replicateF $dst,$mem" %}
4372   ins_encode %{
4373     uint vlen = Matcher::vector_length(this);
4374     if (vlen <= 4) {
4375       __ movdl($dst$$XMMRegister, $mem$$Address);
4376       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4377     } else {
4378       assert(VM_Version::supports_avx(), "sanity");
4379       int vlen_enc = vector_length_encoding(this);
4380       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
4381     }
4382   %}
4383   ins_pipe( pipe_slow );
4384 %}
4385 
4386 // Replicate float scalar immediate to be vector by loading from const table.
4387 instruct ReplF_imm(vec dst, immF con) %{
4388   match(Set dst (ReplicateF con));
4389   format %{ "replicateF $dst,$con" %}
4390   ins_encode %{
4391     InternalAddress addr = $constantaddress(T_FLOAT, vreplicate_imm(T_FLOAT, $con$$constant, Matcher::vector_length(this)));
4392     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
4393   %}
4394   ins_pipe( pipe_slow );
4395 %}
4396 
4397 instruct ReplF_zero(vec dst, immF0 zero) %{
4398   match(Set dst (ReplicateF zero));
4399   format %{ "replicateF $dst,$zero" %}
4400   ins_encode %{
4401     uint vlen = Matcher::vector_length(this);
4402     if (vlen <= 4) {
4403       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4404     } else {
4405       int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4407     }
4408   %}
4409   ins_pipe( fpu_reg_reg );
4410 %}
4411 
4412 // ====================ReplicateD=======================================
4413 
4414 // Replicate double (8 bytes) scalar to be vector
4415 instruct ReplD_reg(vec dst, vlRegD src) %{
4416   match(Set dst (ReplicateD src));
4417   format %{ "replicateD $dst,$src" %}
4418   ins_encode %{
4419     uint vlen = Matcher::vector_length(this);
4420     if (vlen == 2) {
4421       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4422     } else if (VM_Version::supports_avx2()) {
4423       int vlen_enc = vector_length_encoding(this);
4424       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4425     } else {
4426       assert(vlen == 4, "sanity");
4427       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4428       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4429     }
4430   %}
4431   ins_pipe( pipe_slow );
4432 %}
4433 
4434 instruct ReplD_mem(vec dst, memory mem) %{
4435   match(Set dst (ReplicateD (LoadD mem)));
4436   format %{ "replicateD $dst,$mem" %}
4437   ins_encode %{
4438     uint vlen = Matcher::vector_length(this);
4439     if (vlen == 2) {
4440       __ movq($dst$$XMMRegister, $mem$$Address);
4441       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
4442     } else {
4443       assert(VM_Version::supports_avx(), "sanity");
4444       int vlen_enc = vector_length_encoding(this);
4445       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4446     }
4447   %}
4448   ins_pipe( pipe_slow );
4449 %}
4450 
4451 // Replicate double (8 byte) scalar immediate to be vector by loading from const table.
4452 instruct ReplD_imm(vec dst, immD con) %{
4453   match(Set dst (ReplicateD con));
4454   format %{ "replicateD $dst,$con" %}
4455   ins_encode %{
4456     InternalAddress addr = $constantaddress(T_DOUBLE, vreplicate_imm(T_DOUBLE, $con$$constant, Matcher::vector_length(this)));
4457     __ load_vector($dst$$XMMRegister, addr, Matcher::vector_length_in_bytes(this));
4458   %}
4459   ins_pipe( pipe_slow );
4460 %}
4461 
4462 instruct ReplD_zero(vec dst, immD0 zero) %{
4463   match(Set dst (ReplicateD zero));
4464   format %{ "replicateD $dst,$zero" %}
4465   ins_encode %{
4466     uint vlen = Matcher::vector_length(this);
4467     if (vlen == 2) {
4468       __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4469     } else {
4470       int vlen_enc = vector_length_encoding(this);
4471       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4472     }
4473   %}
4474   ins_pipe( fpu_reg_reg );
4475 %}
4476 
4477 // ====================VECTOR INSERT=======================================
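//
// Conceptual semantics of the insert rules below (illustration only, not emitted
// code): copy the source vector and overwrite a single lane with the scalar,
//
//   dst = src;       // the 128-bit forms update $dst in place instead
//   dst[idx] = val;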
4478 
4479 instruct insert(vec dst, rRegI val, immU8 idx) %{
4480   predicate(Matcher::vector_length_in_bytes(n) < 32);
4481   match(Set dst (VectorInsert (Binary dst val) idx));
4482   format %{ "vector_insert $dst,$val,$idx" %}
4483   ins_encode %{
4484     assert(UseSSE >= 4, "required");
4485     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
4486 
4487     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4488 
4489     assert(is_integral_type(elem_bt), "");
4490     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4491 
4492     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4493   %}
4494   ins_pipe( pipe_slow );
4495 %}
4496 
4497 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4498   predicate(Matcher::vector_length_in_bytes(n) == 32);
4499   match(Set dst (VectorInsert (Binary src val) idx));
4500   effect(TEMP vtmp);
4501   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4502   ins_encode %{
4503     int vlen_enc = Assembler::AVX_256bit;
4504     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4505     int elem_per_lane = 16/type2aelembytes(elem_bt);
4506     int log2epr = log2(elem_per_lane);
4507 
4508     assert(is_integral_type(elem_bt), "sanity");
4509     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4510 
4511     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4512     uint y_idx = ($idx$$constant >> log2epr) & 1;
4513     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4514     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4515     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4516   %}
4517   ins_pipe( pipe_slow );
4518 %}
4519 
4520 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4521   predicate(Matcher::vector_length_in_bytes(n) == 64);
4522   match(Set dst (VectorInsert (Binary src val) idx));
4523   effect(TEMP vtmp);
4524   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4525   ins_encode %{
4526     assert(UseAVX > 2, "sanity");
4527 
4528     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4529     int elem_per_lane = 16/type2aelembytes(elem_bt);
4530     int log2epr = log2(elem_per_lane);
4531 
4532     assert(is_integral_type(elem_bt), "");
4533     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4534 
4535     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4536     uint y_idx = ($idx$$constant >> log2epr) & 3;
4537     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4538     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4539     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4540   %}
4541   ins_pipe( pipe_slow );
4542 %}
4543 
4544 #ifdef _LP64
4545 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4546   predicate(Matcher::vector_length(n) == 2);
4547   match(Set dst (VectorInsert (Binary dst val) idx));
4548   format %{ "vector_insert $dst,$val,$idx" %}
4549   ins_encode %{
4550     assert(UseSSE >= 4, "required");
4551     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4552     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4553 
4554     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4555   %}
4556   ins_pipe( pipe_slow );
4557 %}
4558 
4559 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4560   predicate(Matcher::vector_length(n) == 4);
4561   match(Set dst (VectorInsert (Binary src val) idx));
4562   effect(TEMP vtmp);
4563   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4564   ins_encode %{
4565     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4566     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4567 
4568     uint x_idx = $idx$$constant & right_n_bits(1);
4569     uint y_idx = ($idx$$constant >> 1) & 1;
4570     int vlen_enc = Assembler::AVX_256bit;
4571     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4572     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4573     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4574   %}
4575   ins_pipe( pipe_slow );
4576 %}
4577 
4578 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4579   predicate(Matcher::vector_length(n) == 8);
4580   match(Set dst (VectorInsert (Binary src val) idx));
4581   effect(TEMP vtmp);
4582   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4583   ins_encode %{
4584     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
4585     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4586 
4587     uint x_idx = $idx$$constant & right_n_bits(1);
4588     uint y_idx = ($idx$$constant >> 1) & 3;
4589     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4590     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4591     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4592   %}
4593   ins_pipe( pipe_slow );
4594 %}
4595 #endif
4596 
4597 instruct insertF(vec dst, regF val, immU8 idx) %{
4598   predicate(Matcher::vector_length(n) < 8);
4599   match(Set dst (VectorInsert (Binary dst val) idx));
4600   format %{ "vector_insert $dst,$val,$idx" %}
4601   ins_encode %{
4602     assert(UseSSE >= 4, "sanity");
4603 
4604     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4605     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4606 
4607     uint x_idx = $idx$$constant & right_n_bits(2);
4608     __ insertps($dst$$XMMRegister, $val$$XMMRegister, x_idx << 4);
4609   %}
4610   ins_pipe( pipe_slow );
4611 %}
4612 
4613 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4614   predicate(Matcher::vector_length(n) >= 8);
4615   match(Set dst (VectorInsert (Binary src val) idx));
4616   effect(TEMP vtmp);
4617   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4618   ins_encode %{
4619     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4620     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4621 
4622     int vlen = Matcher::vector_length(this);
4623     uint x_idx = $idx$$constant & right_n_bits(2);
4624     if (vlen == 8) {
4625       uint y_idx = ($idx$$constant >> 2) & 1;
4626       int vlen_enc = Assembler::AVX_256bit;
4627       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4628       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
4629       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4630     } else {
4631       assert(vlen == 16, "sanity");
4632       uint y_idx = ($idx$$constant >> 2) & 3;
4633       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4634       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx << 4);
4635       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4636     }
4637   %}
4638   ins_pipe( pipe_slow );
4639 %}
4640 
4641 #ifdef _LP64
4642 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4643   predicate(Matcher::vector_length(n) == 2);
4644   match(Set dst (VectorInsert (Binary dst val) idx));
4645   effect(TEMP tmp);
4646   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4647   ins_encode %{
4648     assert(UseSSE >= 4, "sanity");
4649     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4650     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4651 
4652     __ movq($tmp$$Register, $val$$XMMRegister);
4653     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
4654   %}
4655   ins_pipe( pipe_slow );
4656 %}
4657 
4658 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
4659   predicate(Matcher::vector_length(n) == 4);
4660   match(Set dst (VectorInsert (Binary src val) idx));
4661   effect(TEMP vtmp, TEMP tmp);
4662   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4663   ins_encode %{
4664     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4665     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4666 
4667     uint x_idx = $idx$$constant & right_n_bits(1);
4668     uint y_idx = ($idx$$constant >> 1) & 1;
4669     int vlen_enc = Assembler::AVX_256bit;
4670     __ movq($tmp$$Register, $val$$XMMRegister);
4671     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4672     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4673     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4674   %}
4675   ins_pipe( pipe_slow );
4676 %}
4677 
4678 instruct insert8D(vec dst, vec src, regD val, immI idx, rRegL tmp, legVec vtmp) %{
4679   predicate(Matcher::vector_length(n) == 8);
4680   match(Set dst (VectorInsert (Binary src val) idx));
4681   effect(TEMP tmp, TEMP vtmp);
4682   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4683   ins_encode %{
4684     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4685     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4686 
4687     uint x_idx = $idx$$constant & right_n_bits(1);
4688     uint y_idx = ($idx$$constant >> 1) & 3;
4689     __ movq($tmp$$Register, $val$$XMMRegister);
4690     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4691     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4692     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4693   %}
4694   ins_pipe( pipe_slow );
4695 %}
4696 #endif
4697 
4698 // ====================REDUCTION ARITHMETIC=======================================
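//
// Conceptual semantics of the reduction rules below (illustration only, not
// emitted code): fold all lanes of the vector input into one scalar and combine
// it with the scalar input,
//
//   T acc = src1;                      // or the incoming $dst for the FP forms
//   for (int i = 0; i < vlen; i++) {
//     acc = OP(acc, src2[i]);          // OP is add, mul, and, or, xor, min or max
//   }
//   dst = acc;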
4699 
4700 // =======================Int Reduction==========================================
4701 
4702 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4703   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
4704   match(Set dst (AddReductionVI src1 src2));
4705   match(Set dst (MulReductionVI src1 src2));
4706   match(Set dst (AndReductionV  src1 src2));
4707   match(Set dst ( OrReductionV  src1 src2));
4708   match(Set dst (XorReductionV  src1 src2));
4709   match(Set dst (MinReductionV  src1 src2));
4710   match(Set dst (MaxReductionV  src1 src2));
4711   effect(TEMP vtmp1, TEMP vtmp2);
4712   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4713   ins_encode %{
4714     int opcode = this->ideal_Opcode();
4715     int vlen = Matcher::vector_length(this, $src2);
4716     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4717   %}
4718   ins_pipe( pipe_slow );
4719 %}
4720 
4721 // =======================Long Reduction==========================================
4722 
4723 #ifdef _LP64
4724 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4725   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
4726   match(Set dst (AddReductionVL src1 src2));
4727   match(Set dst (MulReductionVL src1 src2));
4728   match(Set dst (AndReductionV  src1 src2));
4729   match(Set dst ( OrReductionV  src1 src2));
4730   match(Set dst (XorReductionV  src1 src2));
4731   match(Set dst (MinReductionV  src1 src2));
4732   match(Set dst (MaxReductionV  src1 src2));
4733   effect(TEMP vtmp1, TEMP vtmp2);
4734   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4735   ins_encode %{
4736     int opcode = this->ideal_Opcode();
4737     int vlen = Matcher::vector_length(this, $src2);
4738     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4739   %}
4740   ins_pipe( pipe_slow );
4741 %}
4742 
4743 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
4744   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
4745   match(Set dst (AddReductionVL src1 src2));
4746   match(Set dst (MulReductionVL src1 src2));
4747   match(Set dst (AndReductionV  src1 src2));
4748   match(Set dst ( OrReductionV  src1 src2));
4749   match(Set dst (XorReductionV  src1 src2));
4750   match(Set dst (MinReductionV  src1 src2));
4751   match(Set dst (MaxReductionV  src1 src2));
4752   effect(TEMP vtmp1, TEMP vtmp2);
4753   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4754   ins_encode %{
4755     int opcode = this->ideal_Opcode();
4756     int vlen = Matcher::vector_length(this, $src2);
4757     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4758   %}
4759   ins_pipe( pipe_slow );
4760 %}
4761 #endif // _LP64
4762 
4763 // =======================Float Reduction==========================================
4764 
4765 instruct reductionF128(regF dst, vec src, vec vtmp) %{
4766   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
4767   match(Set dst (AddReductionVF dst src));
4768   match(Set dst (MulReductionVF dst src));
4769   effect(TEMP dst, TEMP vtmp);
4770   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
4771   ins_encode %{
4772     int opcode = this->ideal_Opcode();
4773     int vlen = Matcher::vector_length(this, $src);
4774     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4775   %}
4776   ins_pipe( pipe_slow );
4777 %}
4778 
4779 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
4780   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4781   match(Set dst (AddReductionVF dst src));
4782   match(Set dst (MulReductionVF dst src));
4783   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4784   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4785   ins_encode %{
4786     int opcode = this->ideal_Opcode();
4787     int vlen = Matcher::vector_length(this, $src);
4788     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4789   %}
4790   ins_pipe( pipe_slow );
4791 %}
4792 
4793 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4794   predicate(Matcher::vector_length(n->in(2)) == 16); // src
4795   match(Set dst (AddReductionVF dst src));
4796   match(Set dst (MulReductionVF dst src));
4797   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4798   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4799   ins_encode %{
4800     int opcode = this->ideal_Opcode();
4801     int vlen = Matcher::vector_length(this, $src);
4802     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4803   %}
4804   ins_pipe( pipe_slow );
4805 %}
4806 
4807 // =======================Double Reduction==========================================
4808 
4809 instruct reduction2D(regD dst, vec src, vec vtmp) %{
4810   predicate(Matcher::vector_length(n->in(2)) == 2); // src
4811   match(Set dst (AddReductionVD dst src));
4812   match(Set dst (MulReductionVD dst src));
4813   effect(TEMP dst, TEMP vtmp);
4814   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
4815   ins_encode %{
4816     int opcode = this->ideal_Opcode();
4817     int vlen = Matcher::vector_length(this, $src);
4818     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4819 %}
4820   ins_pipe( pipe_slow );
4821 %}
4822 
4823 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
4824   predicate(Matcher::vector_length(n->in(2)) == 4); // src
4825   match(Set dst (AddReductionVD dst src));
4826   match(Set dst (MulReductionVD dst src));
4827   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4828   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4829   ins_encode %{
4830     int opcode = this->ideal_Opcode();
4831     int vlen = Matcher::vector_length(this, $src);
4832     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4833   %}
4834   ins_pipe( pipe_slow );
4835 %}
4836 
4837 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4838   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4839   match(Set dst (AddReductionVD dst src));
4840   match(Set dst (MulReductionVD dst src));
4841   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4842   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4843   ins_encode %{
4844     int opcode = this->ideal_Opcode();
4845     int vlen = Matcher::vector_length(this, $src);
4846     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4847   %}
4848   ins_pipe( pipe_slow );
4849 %}
4850 
4851 // =======================Byte Reduction==========================================
4852 
4853 #ifdef _LP64
4854 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4855   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
4856   match(Set dst (AddReductionVI src1 src2));
4857   match(Set dst (AndReductionV  src1 src2));
4858   match(Set dst ( OrReductionV  src1 src2));
4859   match(Set dst (XorReductionV  src1 src2));
4860   match(Set dst (MinReductionV  src1 src2));
4861   match(Set dst (MaxReductionV  src1 src2));
4862   effect(TEMP vtmp1, TEMP vtmp2);
4863   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4864   ins_encode %{
4865     int opcode = this->ideal_Opcode();
4866     int vlen = Matcher::vector_length(this, $src2);
4867     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4868   %}
4869   ins_pipe( pipe_slow );
4870 %}
4871 
4872 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4873   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
4874   match(Set dst (AddReductionVI src1 src2));
4875   match(Set dst (AndReductionV  src1 src2));
4876   match(Set dst ( OrReductionV  src1 src2));
4877   match(Set dst (XorReductionV  src1 src2));
4878   match(Set dst (MinReductionV  src1 src2));
4879   match(Set dst (MaxReductionV  src1 src2));
4880   effect(TEMP vtmp1, TEMP vtmp2);
4881   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4882   ins_encode %{
4883     int opcode = this->ideal_Opcode();
4884     int vlen = Matcher::vector_length(this, $src2);
4885     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4886   %}
4887   ins_pipe( pipe_slow );
4888 %}
4889 #endif
4890 
4891 // =======================Short Reduction==========================================
4892 
4893 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4894   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
4895   match(Set dst (AddReductionVI src1 src2));
4896   match(Set dst (MulReductionVI src1 src2));
4897   match(Set dst (AndReductionV  src1 src2));
4898   match(Set dst ( OrReductionV  src1 src2));
4899   match(Set dst (XorReductionV  src1 src2));
4900   match(Set dst (MinReductionV  src1 src2));
4901   match(Set dst (MaxReductionV  src1 src2));
4902   effect(TEMP vtmp1, TEMP vtmp2);
4903   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4904   ins_encode %{
4905     int opcode = this->ideal_Opcode();
4906     int vlen = Matcher::vector_length(this, $src2);
4907     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4908   %}
4909   ins_pipe( pipe_slow );
4910 %}
4911 
4912 // =======================Mul Reduction==========================================
4913 
4914 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4915   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4916             Matcher::vector_length(n->in(2)) <= 32); // src2
4917   match(Set dst (MulReductionVI src1 src2));
4918   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4919   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4920   ins_encode %{
4921     int opcode = this->ideal_Opcode();
4922     int vlen = Matcher::vector_length(this, $src2);
4923     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4924   %}
4925   ins_pipe( pipe_slow );
4926 %}
4927 
4928 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4929   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4930             Matcher::vector_length(n->in(2)) == 64); // src2
4931   match(Set dst (MulReductionVI src1 src2));
4932   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4933   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4934   ins_encode %{
4935     int opcode = this->ideal_Opcode();
4936     int vlen = Matcher::vector_length(this, $src2);
4937     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4938   %}
4939   ins_pipe( pipe_slow );
4940 %}
4941 
4942 //--------------------Min/Max Float Reduction --------------------
// Float Min/Max Reduction
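// In the immF/immD forms the scalar input $src1 must be the identity value of the
// reduction (+Inf for min, -Inf for max), which the predicates enforce; only $src2
// is actually reduced. The *_av forms reduce into the live accumulator $dst instead.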
4944 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
4945                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4946   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4947             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4948              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4949             Matcher::vector_length(n->in(2)) == 2);
4950   match(Set dst (MinReductionV src1 src2));
4951   match(Set dst (MaxReductionV src1 src2));
4952   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4953   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4954   ins_encode %{
4955     assert(UseAVX > 0, "sanity");
4956 
4957     int opcode = this->ideal_Opcode();
4958     int vlen = Matcher::vector_length(this, $src2);
4959     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4960                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4961   %}
4962   ins_pipe( pipe_slow );
4963 %}
4964 
4965 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
4966                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4967   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4968             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4969              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4970             Matcher::vector_length(n->in(2)) >= 4);
4971   match(Set dst (MinReductionV src1 src2));
4972   match(Set dst (MaxReductionV src1 src2));
4973   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4974   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4975   ins_encode %{
4976     assert(UseAVX > 0, "sanity");
4977 
4978     int opcode = this->ideal_Opcode();
4979     int vlen = Matcher::vector_length(this, $src2);
4980     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4981                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4982   %}
4983   ins_pipe( pipe_slow );
4984 %}
4985 
4986 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
4987                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4988   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4989             Matcher::vector_length(n->in(2)) == 2);
4990   match(Set dst (MinReductionV dst src));
4991   match(Set dst (MaxReductionV dst src));
4992   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4993   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4994   ins_encode %{
4995     assert(UseAVX > 0, "sanity");
4996 
4997     int opcode = this->ideal_Opcode();
4998     int vlen = Matcher::vector_length(this, $src);
4999     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
5000                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
5001   %}
5002   ins_pipe( pipe_slow );
5003 %}
5004 
5005 
5006 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
5007                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
5008   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
5009             Matcher::vector_length(n->in(2)) >= 4);
5010   match(Set dst (MinReductionV dst src));
5011   match(Set dst (MaxReductionV dst src));
5012   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
5013   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
5014   ins_encode %{
5015     assert(UseAVX > 0, "sanity");
5016 
5017     int opcode = this->ideal_Opcode();
5018     int vlen = Matcher::vector_length(this, $src);
5019     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
5020                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
5021   %}
5022   ins_pipe( pipe_slow );
5023 %}
5024 
5025 
//--------------------Min/Max Double Reduction --------------------
5027 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
5028                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
5029                             rFlagsReg cr) %{
5030   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5031             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
5032              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
5033             Matcher::vector_length(n->in(2)) == 2);
5034   match(Set dst (MinReductionV src1 src2));
5035   match(Set dst (MaxReductionV src1 src2));
5036   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
5037   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
5038   ins_encode %{
5039     assert(UseAVX > 0, "sanity");
5040 
5041     int opcode = this->ideal_Opcode();
5042     int vlen = Matcher::vector_length(this, $src2);
5043     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
5044                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
5045   %}
5046   ins_pipe( pipe_slow );
5047 %}
5048 
5049 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
5050                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
5051                            rFlagsReg cr) %{
5052   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5053             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
5054              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
5055             Matcher::vector_length(n->in(2)) >= 4);
5056   match(Set dst (MinReductionV src1 src2));
5057   match(Set dst (MaxReductionV src1 src2));
5058   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
5059   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
5060   ins_encode %{
5061     assert(UseAVX > 0, "sanity");
5062 
5063     int opcode = this->ideal_Opcode();
5064     int vlen = Matcher::vector_length(this, $src2);
5065     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
5066                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
5067   %}
5068   ins_pipe( pipe_slow );
5069 %}
5070 
5071 
5072 instruct minmax_reduction2D_av(legRegD dst, legVec src,
5073                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
5074                                rFlagsReg cr) %{
5075   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5076             Matcher::vector_length(n->in(2)) == 2);
5077   match(Set dst (MinReductionV dst src));
5078   match(Set dst (MaxReductionV dst src));
5079   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
5080   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
5081   ins_encode %{
5082     assert(UseAVX > 0, "sanity");
5083 
5084     int opcode = this->ideal_Opcode();
5085     int vlen = Matcher::vector_length(this, $src);
5086     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
5087                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
5088   %}
5089   ins_pipe( pipe_slow );
5090 %}
5091 
5092 instruct minmax_reductionD_av(legRegD dst, legVec src,
5093                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
5094                               rFlagsReg cr) %{
5095   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
5096             Matcher::vector_length(n->in(2)) >= 4);
5097   match(Set dst (MinReductionV dst src));
5098   match(Set dst (MaxReductionV dst src));
5099   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
5100   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
5101   ins_encode %{
5102     assert(UseAVX > 0, "sanity");
5103 
5104     int opcode = this->ideal_Opcode();
5105     int vlen = Matcher::vector_length(this, $src);
5106     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
5107                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
5108   %}
5109   ins_pipe( pipe_slow );
5110 %}
5111 
5112 // ====================VECTOR ARITHMETIC=======================================
5113 
5114 // --------------------------------- ADD --------------------------------------
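//
// Conceptual semantics (illustration only, not emitted code): lane-wise addition,
//
//   for (int i = 0; i < vlen; i++) {
//     dst[i] = src1[i] + src2[i];
//   }
//
// The UseAVX == 0 rules use the destructive two-operand SSE encodings; the AVX
// rules are three-operand, and the memory forms require a vector wider than 8 bytes.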
5115 
5116 // Bytes vector add
5117 instruct vaddB(vec dst, vec src) %{
5118   predicate(UseAVX == 0);
5119   match(Set dst (AddVB dst src));
5120   format %{ "paddb   $dst,$src\t! add packedB" %}
5121   ins_encode %{
5122     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5123   %}
5124   ins_pipe( pipe_slow );
5125 %}
5126 
5127 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
5128   predicate(UseAVX > 0);
5129   match(Set dst (AddVB src1 src2));
5130   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
5131   ins_encode %{
5132     int vlen_enc = vector_length_encoding(this);
5133     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5134   %}
5135   ins_pipe( pipe_slow );
5136 %}
5137 
5138 instruct vaddB_mem(vec dst, vec src, memory mem) %{
5139   predicate((UseAVX > 0) &&
5140             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5141   match(Set dst (AddVB src (LoadVector mem)));
5142   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
5143   ins_encode %{
5144     int vlen_enc = vector_length_encoding(this);
5145     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5146   %}
5147   ins_pipe( pipe_slow );
5148 %}
5149 
5150 // Shorts/Chars vector add
5151 instruct vaddS(vec dst, vec src) %{
5152   predicate(UseAVX == 0);
5153   match(Set dst (AddVS dst src));
5154   format %{ "paddw   $dst,$src\t! add packedS" %}
5155   ins_encode %{
5156     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
5157   %}
5158   ins_pipe( pipe_slow );
5159 %}
5160 
5161 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
5162   predicate(UseAVX > 0);
5163   match(Set dst (AddVS src1 src2));
5164   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
5165   ins_encode %{
5166     int vlen_enc = vector_length_encoding(this);
5167     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5168   %}
5169   ins_pipe( pipe_slow );
5170 %}
5171 
5172 instruct vaddS_mem(vec dst, vec src, memory mem) %{
5173   predicate((UseAVX > 0) &&
5174             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5175   match(Set dst (AddVS src (LoadVector mem)));
5176   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
5177   ins_encode %{
5178     int vlen_enc = vector_length_encoding(this);
5179     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5180   %}
5181   ins_pipe( pipe_slow );
5182 %}
5183 
5184 // Integers vector add
5185 instruct vaddI(vec dst, vec src) %{
5186   predicate(UseAVX == 0);
5187   match(Set dst (AddVI dst src));
5188   format %{ "paddd   $dst,$src\t! add packedI" %}
5189   ins_encode %{
5190     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
5191   %}
5192   ins_pipe( pipe_slow );
5193 %}
5194 
5195 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
5196   predicate(UseAVX > 0);
5197   match(Set dst (AddVI src1 src2));
5198   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
5199   ins_encode %{
5200     int vlen_enc = vector_length_encoding(this);
5201     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5202   %}
5203   ins_pipe( pipe_slow );
5204 %}
5205 
5206 
5207 instruct vaddI_mem(vec dst, vec src, memory mem) %{
5208   predicate((UseAVX > 0) &&
5209             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5210   match(Set dst (AddVI src (LoadVector mem)));
5211   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
5212   ins_encode %{
5213     int vlen_enc = vector_length_encoding(this);
5214     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5215   %}
5216   ins_pipe( pipe_slow );
5217 %}
5218 
5219 // Longs vector add
5220 instruct vaddL(vec dst, vec src) %{
5221   predicate(UseAVX == 0);
5222   match(Set dst (AddVL dst src));
5223   format %{ "paddq   $dst,$src\t! add packedL" %}
5224   ins_encode %{
5225     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
5226   %}
5227   ins_pipe( pipe_slow );
5228 %}
5229 
5230 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
5231   predicate(UseAVX > 0);
5232   match(Set dst (AddVL src1 src2));
5233   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
5234   ins_encode %{
5235     int vlen_enc = vector_length_encoding(this);
5236     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5237   %}
5238   ins_pipe( pipe_slow );
5239 %}
5240 
5241 instruct vaddL_mem(vec dst, vec src, memory mem) %{
5242   predicate((UseAVX > 0) &&
5243             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5244   match(Set dst (AddVL src (LoadVector mem)));
5245   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
5246   ins_encode %{
5247     int vlen_enc = vector_length_encoding(this);
5248     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5249   %}
5250   ins_pipe( pipe_slow );
5251 %}
5252 
5253 // Floats vector add
5254 instruct vaddF(vec dst, vec src) %{
5255   predicate(UseAVX == 0);
5256   match(Set dst (AddVF dst src));
5257   format %{ "addps   $dst,$src\t! add packedF" %}
5258   ins_encode %{
5259     __ addps($dst$$XMMRegister, $src$$XMMRegister);
5260   %}
5261   ins_pipe( pipe_slow );
5262 %}
5263 
5264 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
5265   predicate(UseAVX > 0);
5266   match(Set dst (AddVF src1 src2));
5267   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
5268   ins_encode %{
5269     int vlen_enc = vector_length_encoding(this);
5270     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5271   %}
5272   ins_pipe( pipe_slow );
5273 %}
5274 
5275 instruct vaddF_mem(vec dst, vec src, memory mem) %{
5276   predicate((UseAVX > 0) &&
5277             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5278   match(Set dst (AddVF src (LoadVector mem)));
5279   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
5280   ins_encode %{
5281     int vlen_enc = vector_length_encoding(this);
5282     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5283   %}
5284   ins_pipe( pipe_slow );
5285 %}
5286 
5287 // Doubles vector add
5288 instruct vaddD(vec dst, vec src) %{
5289   predicate(UseAVX == 0);
5290   match(Set dst (AddVD dst src));
5291   format %{ "addpd   $dst,$src\t! add packedD" %}
5292   ins_encode %{
5293     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
5294   %}
5295   ins_pipe( pipe_slow );
5296 %}
5297 
5298 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
5299   predicate(UseAVX > 0);
5300   match(Set dst (AddVD src1 src2));
5301   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
5302   ins_encode %{
5303     int vlen_enc = vector_length_encoding(this);
5304     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5305   %}
5306   ins_pipe( pipe_slow );
5307 %}
5308 
5309 instruct vaddD_mem(vec dst, vec src, memory mem) %{
5310   predicate((UseAVX > 0) &&
5311             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5312   match(Set dst (AddVD src (LoadVector mem)));
5313   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
5314   ins_encode %{
5315     int vlen_enc = vector_length_encoding(this);
5316     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5317   %}
5318   ins_pipe( pipe_slow );
5319 %}
5320 
5321 // --------------------------------- SUB --------------------------------------
5322 
5323 // Bytes vector sub
5324 instruct vsubB(vec dst, vec src) %{
5325   predicate(UseAVX == 0);
5326   match(Set dst (SubVB dst src));
5327   format %{ "psubb   $dst,$src\t! sub packedB" %}
5328   ins_encode %{
5329     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5330   %}
5331   ins_pipe( pipe_slow );
5332 %}
5333 
5334 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
5335   predicate(UseAVX > 0);
5336   match(Set dst (SubVB src1 src2));
5337   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
5338   ins_encode %{
5339     int vlen_enc = vector_length_encoding(this);
5340     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5341   %}
5342   ins_pipe( pipe_slow );
5343 %}
5344 
5345 instruct vsubB_mem(vec dst, vec src, memory mem) %{
5346   predicate((UseAVX > 0) &&
5347             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5348   match(Set dst (SubVB src (LoadVector mem)));
5349   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
5350   ins_encode %{
5351     int vlen_enc = vector_length_encoding(this);
5352     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5353   %}
5354   ins_pipe( pipe_slow );
5355 %}
5356 
5357 // Shorts/Chars vector sub
5358 instruct vsubS(vec dst, vec src) %{
5359   predicate(UseAVX == 0);
5360   match(Set dst (SubVS dst src));
5361   format %{ "psubw   $dst,$src\t! sub packedS" %}
5362   ins_encode %{
5363     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5364   %}
5365   ins_pipe( pipe_slow );
5366 %}
5367 
5368 
5369 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
5370   predicate(UseAVX > 0);
5371   match(Set dst (SubVS src1 src2));
5372   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
5373   ins_encode %{
5374     int vlen_enc = vector_length_encoding(this);
5375     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5376   %}
5377   ins_pipe( pipe_slow );
5378 %}
5379 
5380 instruct vsubS_mem(vec dst, vec src, memory mem) %{
5381   predicate((UseAVX > 0) &&
5382             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5383   match(Set dst (SubVS src (LoadVector mem)));
5384   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
5385   ins_encode %{
5386     int vlen_enc = vector_length_encoding(this);
5387     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5388   %}
5389   ins_pipe( pipe_slow );
5390 %}
5391 
5392 // Integers vector sub
5393 instruct vsubI(vec dst, vec src) %{
5394   predicate(UseAVX == 0);
5395   match(Set dst (SubVI dst src));
5396   format %{ "psubd   $dst,$src\t! sub packedI" %}
5397   ins_encode %{
5398     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
5399   %}
5400   ins_pipe( pipe_slow );
5401 %}
5402 
5403 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
5404   predicate(UseAVX > 0);
5405   match(Set dst (SubVI src1 src2));
5406   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
5407   ins_encode %{
5408     int vlen_enc = vector_length_encoding(this);
5409     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5410   %}
5411   ins_pipe( pipe_slow );
5412 %}
5413 
5414 instruct vsubI_mem(vec dst, vec src, memory mem) %{
5415   predicate((UseAVX > 0) &&
5416             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5417   match(Set dst (SubVI src (LoadVector mem)));
5418   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
5419   ins_encode %{
5420     int vlen_enc = vector_length_encoding(this);
5421     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5422   %}
5423   ins_pipe( pipe_slow );
5424 %}
5425 
5426 // Longs vector sub
5427 instruct vsubL(vec dst, vec src) %{
5428   predicate(UseAVX == 0);
5429   match(Set dst (SubVL dst src));
5430   format %{ "psubq   $dst,$src\t! sub packedL" %}
5431   ins_encode %{
5432     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
5433   %}
5434   ins_pipe( pipe_slow );
5435 %}
5436 
5437 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
5438   predicate(UseAVX > 0);
5439   match(Set dst (SubVL src1 src2));
5440   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
5441   ins_encode %{
5442     int vlen_enc = vector_length_encoding(this);
5443     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5444   %}
5445   ins_pipe( pipe_slow );
5446 %}
5447 
5448 
5449 instruct vsubL_mem(vec dst, vec src, memory mem) %{
5450   predicate((UseAVX > 0) &&
5451             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5452   match(Set dst (SubVL src (LoadVector mem)));
5453   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
5454   ins_encode %{
5455     int vlen_enc = vector_length_encoding(this);
5456     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5457   %}
5458   ins_pipe( pipe_slow );
5459 %}
5460 
5461 // Floats vector sub
5462 instruct vsubF(vec dst, vec src) %{
5463   predicate(UseAVX == 0);
5464   match(Set dst (SubVF dst src));
5465   format %{ "subps   $dst,$src\t! sub packedF" %}
5466   ins_encode %{
5467     __ subps($dst$$XMMRegister, $src$$XMMRegister);
5468   %}
5469   ins_pipe( pipe_slow );
5470 %}
5471 
5472 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
5473   predicate(UseAVX > 0);
5474   match(Set dst (SubVF src1 src2));
5475   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
5476   ins_encode %{
5477     int vlen_enc = vector_length_encoding(this);
5478     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5479   %}
5480   ins_pipe( pipe_slow );
5481 %}
5482 
5483 instruct vsubF_mem(vec dst, vec src, memory mem) %{
5484   predicate((UseAVX > 0) &&
5485             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5486   match(Set dst (SubVF src (LoadVector mem)));
5487   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
5488   ins_encode %{
5489     int vlen_enc = vector_length_encoding(this);
5490     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5491   %}
5492   ins_pipe( pipe_slow );
5493 %}
5494 
5495 // Doubles vector sub
5496 instruct vsubD(vec dst, vec src) %{
5497   predicate(UseAVX == 0);
5498   match(Set dst (SubVD dst src));
5499   format %{ "subpd   $dst,$src\t! sub packedD" %}
5500   ins_encode %{
5501     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
5502   %}
5503   ins_pipe( pipe_slow );
5504 %}
5505 
5506 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
5507   predicate(UseAVX > 0);
5508   match(Set dst (SubVD src1 src2));
5509   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
5510   ins_encode %{
5511     int vlen_enc = vector_length_encoding(this);
5512     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5513   %}
5514   ins_pipe( pipe_slow );
5515 %}
5516 
5517 instruct vsubD_mem(vec dst, vec src, memory mem) %{
5518   predicate((UseAVX > 0) &&
5519             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5520   match(Set dst (SubVD src (LoadVector mem)));
5521   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
5522   ins_encode %{
5523     int vlen_enc = vector_length_encoding(this);
5524     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5525   %}
5526   ins_pipe( pipe_slow );
5527 %}
5528 
5529 // --------------------------------- MUL --------------------------------------
5530 
5531 // Byte vector mul
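// x86 has no packed byte multiply, so the rules below widen the bytes to words
// (pmovsxbw/vpmovsxbw), multiply with pmullw/vpmullw, mask each product back to
// its low byte and repack with packuswb/vpackuswb.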
5532 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5533   predicate(Matcher::vector_length(n) == 4 ||
5534             Matcher::vector_length(n) == 8);
5535   match(Set dst (MulVB src1 src2));
5536   effect(TEMP dst, TEMP tmp, TEMP scratch);
5537   format %{"vector_mulB $dst,$src1,$src2" %}
5538   ins_encode %{
5539     assert(UseSSE > 3, "required");
5540     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
5541     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
5542     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
5543     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5544     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5545     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5546   %}
5547   ins_pipe( pipe_slow );
5548 %}
5549 
5550 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5551   predicate(Matcher::vector_length(n) == 16 && UseAVX <= 1);
5552   match(Set dst (MulVB src1 src2));
5553   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5554   format %{"vector_mulB $dst,$src1,$src2" %}
5555   ins_encode %{
5556     assert(UseSSE > 3, "required");
5557     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
5558     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
5559     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
5560     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
5561     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
5562     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5563     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
5564     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
5565     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5566     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5567     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5568     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5569   %}
5570   ins_pipe( pipe_slow );
5571 %}
5572 
5573 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5574   predicate(Matcher::vector_length(n) == 16 && UseAVX > 1);
5575   match(Set dst (MulVB src1 src2));
5576   effect(TEMP dst, TEMP tmp, TEMP scratch);
5577   format %{"vector_mulB $dst,$src1,$src2" %}
5578   ins_encode %{
    int vlen_enc = Assembler::AVX_256bit;
5580     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5581     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5582     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5583     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5584     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5585     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
5586     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
5587   %}
5588   ins_pipe( pipe_slow );
5589 %}
5590 
5591 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5592   predicate(Matcher::vector_length(n) == 32);
5593   match(Set dst (MulVB src1 src2));
5594   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5595   format %{"vector_mulB $dst,$src1,$src2" %}
5596   ins_encode %{
5597     assert(UseAVX > 1, "required");
5598     int vlen_enc = Assembler::AVX_256bit;
5599     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5600     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
5601     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5602     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5603     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5604     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5605     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5606     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5607     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5608     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5609     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5610     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5611     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5612     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5613   %}
5614   ins_pipe( pipe_slow );
5615 %}
5616 
5617 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5618   predicate(Matcher::vector_length(n) == 64);
5619   match(Set dst (MulVB src1 src2));
5620   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5621   format %{"vector_mulB $dst,$src1,$src2\n\t" %}
5622   ins_encode %{
5623     assert(UseAVX > 2, "required");
5624     int vlen_enc = Assembler::AVX_512bit;
5625     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5626     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
5627     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5628     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5629     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5630     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5631     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5632     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5633     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5634     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5635     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5636     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5637     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5638     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
5639     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5640   %}
5641   ins_pipe( pipe_slow );
5642 %}
5643 
5644 // Shorts/Chars vector mul
5645 instruct vmulS(vec dst, vec src) %{
5646   predicate(UseAVX == 0);
5647   match(Set dst (MulVS dst src));
5648   format %{ "pmullw $dst,$src\t! mul packedS" %}
5649   ins_encode %{
5650     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
5651   %}
5652   ins_pipe( pipe_slow );
5653 %}
5654 
5655 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
5656   predicate(UseAVX > 0);
5657   match(Set dst (MulVS src1 src2));
5658   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
5659   ins_encode %{
5660     int vlen_enc = vector_length_encoding(this);
5661     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5662   %}
5663   ins_pipe( pipe_slow );
5664 %}
5665 
5666 instruct vmulS_mem(vec dst, vec src, memory mem) %{
5667   predicate((UseAVX > 0) &&
5668             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5669   match(Set dst (MulVS src (LoadVector mem)));
5670   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
5671   ins_encode %{
5672     int vlen_enc = vector_length_encoding(this);
5673     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5674   %}
5675   ins_pipe( pipe_slow );
5676 %}
5677 
5678 // Integers vector mul
5679 instruct vmulI(vec dst, vec src) %{
5680   predicate(UseAVX == 0);
5681   match(Set dst (MulVI dst src));
5682   format %{ "pmulld  $dst,$src\t! mul packedI" %}
5683   ins_encode %{
5684     assert(UseSSE > 3, "required");
5685     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
5686   %}
5687   ins_pipe( pipe_slow );
5688 %}
5689 
5690 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
5691   predicate(UseAVX > 0);
5692   match(Set dst (MulVI src1 src2));
5693   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
5694   ins_encode %{
5695     int vlen_enc = vector_length_encoding(this);
5696     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5697   %}
5698   ins_pipe( pipe_slow );
5699 %}
5700 
5701 instruct vmulI_mem(vec dst, vec src, memory mem) %{
5702   predicate((UseAVX > 0) &&
5703             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5704   match(Set dst (MulVI src (LoadVector mem)));
5705   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
5706   ins_encode %{
5707     int vlen_enc = vector_length_encoding(this);
5708     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5709   %}
5710   ins_pipe( pipe_slow );
5711 %}
5712 
5713 // Longs vector mul
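// With AVX512DQ a single vpmullq performs the packed 64x64->64 multiply; the
// non-DQ rules further below assemble it from 32-bit pieces instead. For
// a = (aH << 32) + aL and b = (bH << 32) + bL,
//   a*b mod 2^64 == aL*bL + ((aL*bH + aH*bL) << 32),
// where pmuludq/vpmuludq supplies aL*bL and the pshufd/pmulld/phaddd sequence
// collects the two cross products before they are shifted up and added.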
5714 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
5715   predicate(VM_Version::supports_avx512dq());
5716   match(Set dst (MulVL src1 src2));
5717   format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
5718   ins_encode %{
5719     assert(UseAVX > 2, "required");
5720     int vlen_enc = vector_length_encoding(this);
5721     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5722   %}
5723   ins_pipe( pipe_slow );
5724 %}
5725 
5726 instruct vmulL_mem(vec dst, vec src, memory mem) %{
5727   predicate(VM_Version::supports_avx512dq() &&
5728               (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5729   match(Set dst (MulVL src (LoadVector mem)));
5730   format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
5731   ins_encode %{
5732     assert(UseAVX > 2, "required");
5733     int vlen_enc = vector_length_encoding(this);
5734     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5735   %}
5736   ins_pipe( pipe_slow );
5737 %}
5738 
5739 instruct mul2L_reg(vec dst, vec src2, legVec tmp) %{
5740   predicate(Matcher::vector_length(n) == 2 && !VM_Version::supports_avx512dq());
5741   match(Set dst (MulVL dst src2));
5742   effect(TEMP dst, TEMP tmp);
5743   format %{ "pshufd $tmp,$src2, 177\n\t"
5744             "pmulld $tmp,$dst\n\t"
5745             "phaddd $tmp,$tmp\n\t"
5746             "pmovzxdq $tmp,$tmp\n\t"
5747             "psllq $tmp, 32\n\t"
5748             "pmuludq $dst,$src2\n\t"
5749             "paddq $dst,$tmp\n\t! mul packed2L" %}
5750 
5751   ins_encode %{
5752     assert(VM_Version::supports_sse4_1(), "required");
5753     int vlen_enc = Assembler::AVX_128bit;
5754     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
5755     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
5756     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5757     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
5758     __ psllq($tmp$$XMMRegister, 32);
5759     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
5760     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
5761   %}
5762   ins_pipe( pipe_slow );
5763 %}
5764 
5765 instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, legVec tmp, legVec tmp1) %{
5766   predicate(Matcher::vector_length(n) == 4 && !VM_Version::supports_avx512dq());
5767   match(Set dst (MulVL src1 src2));
5768   effect(TEMP tmp1, TEMP tmp);
5769   format %{ "vpshufd $tmp,$src2\n\t"
5770             "vpmulld $tmp,$src1,$tmp\n\t"
5771             "vphaddd $tmp,$tmp,$tmp\n\t"
5772             "vpmovzxdq $tmp,$tmp\n\t"
5773             "vpsllq $tmp,$tmp\n\t"
5774             "vpmuludq $tmp1,$src1,$src2\n\t"
5775             "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
5776   ins_encode %{
5777     int vlen_enc = Assembler::AVX_256bit;
5778     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
5779     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5780     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
5781     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5782     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5783     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
5784     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5785     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5786   %}
5787   ins_pipe( pipe_slow );
5788 %}
5789 
5790 // Floats vector mul
5791 instruct vmulF(vec dst, vec src) %{
5792   predicate(UseAVX == 0);
5793   match(Set dst (MulVF dst src));
5794   format %{ "mulps   $dst,$src\t! mul packedF" %}
5795   ins_encode %{
5796     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
5797   %}
5798   ins_pipe( pipe_slow );
5799 %}
5800 
5801 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
5802   predicate(UseAVX > 0);
5803   match(Set dst (MulVF src1 src2));
5804   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
5805   ins_encode %{
5806     int vlen_enc = vector_length_encoding(this);
5807     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5808   %}
5809   ins_pipe( pipe_slow );
5810 %}
5811 
5812 instruct vmulF_mem(vec dst, vec src, memory mem) %{
5813   predicate((UseAVX > 0) &&
5814             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5815   match(Set dst (MulVF src (LoadVector mem)));
5816   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
5817   ins_encode %{
5818     int vlen_enc = vector_length_encoding(this);
5819     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5820   %}
5821   ins_pipe( pipe_slow );
5822 %}
5823 
5824 // Doubles vector mul
5825 instruct vmulD(vec dst, vec src) %{
5826   predicate(UseAVX == 0);
5827   match(Set dst (MulVD dst src));
5828   format %{ "mulpd   $dst,$src\t! mul packedD" %}
5829   ins_encode %{
5830     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
5831   %}
5832   ins_pipe( pipe_slow );
5833 %}
5834 
5835 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
5836   predicate(UseAVX > 0);
5837   match(Set dst (MulVD src1 src2));
5838   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
5839   ins_encode %{
5840     int vlen_enc = vector_length_encoding(this);
5841     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5842   %}
5843   ins_pipe( pipe_slow );
5844 %}
5845 
5846 instruct vmulD_mem(vec dst, vec src, memory mem) %{
5847   predicate((UseAVX > 0) &&
5848             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5849   match(Set dst (MulVD src (LoadVector mem)));
5850   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
5851   ins_encode %{
5852     int vlen_enc = vector_length_encoding(this);
5853     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5854   %}
5855   ins_pipe( pipe_slow );
5856 %}
5857 
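// Vector conditional move: vcmpps/vcmppd produces an all-ones or all-zeros mask
// per element, which vblendvps/vblendvpd then uses to pick $src2 where the
// condition holds and $src1 otherwise.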
5858 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5859   predicate(Matcher::vector_length(n) == 8);
5860   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
5861   effect(TEMP dst, USE src1, USE src2);
5862   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
5863             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
5864          %}
5865   ins_encode %{
5866     assert(UseAVX > 0, "required");
5867 
5868     int vlen_enc = Assembler::AVX_256bit;
5869     int cond = (Assembler::Condition)($copnd$$cmpcode);
5870     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5871     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5872   %}
5873   ins_pipe( pipe_slow );
5874 %}
5875 
5876 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5877   predicate(Matcher::vector_length(n) == 4);
5878   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
5879   effect(TEMP dst, USE src1, USE src2);
5880   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
5881             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
5882          %}
5883   ins_encode %{
5884     assert(UseAVX > 0, "required");
5885 
5886     int vlen_enc = Assembler::AVX_256bit;
5887     int cond = (Assembler::Condition)($copnd$$cmpcode);
5888     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5889     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5890   %}
5891   ins_pipe( pipe_slow );
5892 %}
5893 
5894 // --------------------------------- DIV --------------------------------------
5895 
5896 // Floats vector div
5897 instruct vdivF(vec dst, vec src) %{
5898   predicate(UseAVX == 0);
5899   match(Set dst (DivVF dst src));
5900   format %{ "divps   $dst,$src\t! div packedF" %}
5901   ins_encode %{
5902     __ divps($dst$$XMMRegister, $src$$XMMRegister);
5903   %}
5904   ins_pipe( pipe_slow );
5905 %}
5906 
5907 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
5908   predicate(UseAVX > 0);
5909   match(Set dst (DivVF src1 src2));
5910   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
5911   ins_encode %{
5912     int vlen_enc = vector_length_encoding(this);
5913     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5914   %}
5915   ins_pipe( pipe_slow );
5916 %}
5917 
5918 instruct vdivF_mem(vec dst, vec src, memory mem) %{
5919   predicate((UseAVX > 0) &&
5920             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5921   match(Set dst (DivVF src (LoadVector mem)));
5922   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
5923   ins_encode %{
5924     int vlen_enc = vector_length_encoding(this);
5925     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5926   %}
5927   ins_pipe( pipe_slow );
5928 %}
5929 
5930 // Doubles vector div
5931 instruct vdivD(vec dst, vec src) %{
5932   predicate(UseAVX == 0);
5933   match(Set dst (DivVD dst src));
5934   format %{ "divpd   $dst,$src\t! div packedD" %}
5935   ins_encode %{
5936     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
5937   %}
5938   ins_pipe( pipe_slow );
5939 %}
5940 
5941 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
5942   predicate(UseAVX > 0);
5943   match(Set dst (DivVD src1 src2));
5944   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
5945   ins_encode %{
5946     int vlen_enc = vector_length_encoding(this);
5947     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5948   %}
5949   ins_pipe( pipe_slow );
5950 %}
5951 
5952 instruct vdivD_mem(vec dst, vec src, memory mem) %{
5953   predicate((UseAVX > 0) &&
5954             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5955   match(Set dst (DivVD src (LoadVector mem)));
5956   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
5957   ins_encode %{
5958     int vlen_enc = vector_length_encoding(this);
5959     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5960   %}
5961   ins_pipe( pipe_slow );
5962 %}
5963 
5964 // ------------------------------ MinMax ---------------------------------------
5965 
5966 // Byte, Short, Int vector Min/Max
5967 instruct minmax_reg_sse(vec dst, vec src) %{
5968   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5969             UseAVX == 0);
5970   match(Set dst (MinV dst src));
5971   match(Set dst (MaxV dst src));
5972   format %{ "vector_minmax  $dst,$src\t!  " %}
5973   ins_encode %{
5974     assert(UseSSE >= 4, "required");
5975 
5976     int opcode = this->ideal_Opcode();
5977     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5978     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
5979   %}
5980   ins_pipe( pipe_slow );
5981 %}
5982 
5983 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
5984   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5985             UseAVX > 0);
5986   match(Set dst (MinV src1 src2));
5987   match(Set dst (MaxV src1 src2));
5988   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
5989   ins_encode %{
5990     int opcode = this->ideal_Opcode();
5991     int vlen_enc = vector_length_encoding(this);
5992     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5993 
5994     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5995   %}
5996   ins_pipe( pipe_slow );
5997 %}
5998 
5999 // Long vector Min/Max
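// Note: the SSE rule below constrains its temp to xmm0; the non-VEX blendv
// forms take xmm0 as an implicit mask operand, which is presumably why the
// macro-assembler helper needs that specific register for the 64-bit
// compare-and-select.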
6000 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
6001   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
6002             UseAVX == 0);
6003   match(Set dst (MinV dst src));
6004   match(Set dst (MaxV src dst));
6005   effect(TEMP dst, TEMP tmp);
6006   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
6007   ins_encode %{
6008     assert(UseSSE >= 4, "required");
6009 
6010     int opcode = this->ideal_Opcode();
6011     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6012     assert(elem_bt == T_LONG, "sanity");
6013 
6014     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
6015   %}
6016   ins_pipe( pipe_slow );
6017 %}
6018 
6019 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
6020   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
6021             UseAVX > 0 && !VM_Version::supports_avx512vl());
6022   match(Set dst (MinV src1 src2));
6023   match(Set dst (MaxV src1 src2));
6024   effect(TEMP dst);
6025   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
6026   ins_encode %{
6027     int vlen_enc = vector_length_encoding(this);
6028     int opcode = this->ideal_Opcode();
6029     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6030     assert(elem_bt == T_LONG, "sanity");
6031 
6032     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6033   %}
6034   ins_pipe( pipe_slow );
6035 %}
6036 
6037 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
6038   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
6039             Matcher::vector_element_basic_type(n) == T_LONG);
6040   match(Set dst (MinV src1 src2));
6041   match(Set dst (MaxV src1 src2));
6042   format %{ "vector_minmaxL  $dst,$src1,src2\t! " %}
6043   ins_encode %{
6044     assert(UseAVX > 2, "required");
6045 
6046     int vlen_enc = vector_length_encoding(this);
6047     int opcode = this->ideal_Opcode();
6048     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6049     assert(elem_bt == T_LONG, "sanity");
6050 
6051     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6052   %}
6053   ins_pipe( pipe_slow );
6054 %}
6055 
6056 // Float/Double vector Min/Max
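// Java Math.min/max semantics (NaN in either input yields NaN, and -0.0 orders
// below +0.0) do not match raw vminps/vmaxps behaviour, hence the extra
// temporaries used by the blend-based helpers below.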
6057 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
6058   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
6059             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
6060             UseAVX > 0);
6061   match(Set dst (MinV a b));
6062   match(Set dst (MaxV a b));
6063   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
6064   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
6065   ins_encode %{
6066     assert(UseAVX > 0, "required");
6067 
6068     int opcode = this->ideal_Opcode();
6069     int vlen_enc = vector_length_encoding(this);
6070     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6071 
6072     __ vminmax_fp(opcode, elem_bt,
6073                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
                  $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
6075   %}
6076   ins_pipe( pipe_slow );
6077 %}
6078 
6079 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
6080   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
6081             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
6082   match(Set dst (MinV a b));
6083   match(Set dst (MaxV a b));
6084   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
6085   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
6086   ins_encode %{
6087     assert(UseAVX > 2, "required");
6088 
6089     int opcode = this->ideal_Opcode();
6090     int vlen_enc = vector_length_encoding(this);
6091     BasicType elem_bt = Matcher::vector_element_basic_type(this);
6092 
6093     __ evminmax_fp(opcode, elem_bt,
6094                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
                   $ktmp$$KRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
6096   %}
6097   ins_pipe( pipe_slow );
6098 %}
6099 
6100 // --------------------------------- Signum/CopySign ---------------------------
6101 
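// Math.signum returns +1.0/-1.0 for positive/negative finite inputs and passes
// zeroes and NaNs through unchanged; the zero and one operands hand those
// constants to the masm signum_fp helper in registers.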
6102 instruct signumF_reg(regF dst, regF zero, regF one, rRegP scratch, rFlagsReg cr) %{
6103   match(Set dst (SignumF dst (Binary zero one)));
6104   effect(TEMP scratch, KILL cr);
6105   format %{ "signumF $dst, $dst\t! using $scratch as TEMP" %}
6106   ins_encode %{
6107     int opcode = this->ideal_Opcode();
6108     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
6109   %}
6110   ins_pipe( pipe_slow );
6111 %}
6112 
6113 instruct signumD_reg(regD dst, regD zero, regD one, rRegP scratch, rFlagsReg cr) %{
6114   match(Set dst (SignumD dst (Binary zero one)));
6115   effect(TEMP scratch, KILL cr);
6116   format %{ "signumD $dst, $dst\t! using $scratch as TEMP" %}
6117   ins_encode %{
6118     int opcode = this->ideal_Opcode();
6119     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
6120   %}
6121   ins_pipe( pipe_slow );
6122 %}
6123 
6124 // ---------------------------------------
// For copySign use 0xE4 as the truth-table immediate for vpternlog
6126 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
6127 // C (xmm2) is set to 0x7FFFFFFF
6128 // Wherever xmm2 is 0, we want to pick from B (sign)
6129 // Wherever xmm2 is 1, we want to pick from A (src)
6130 //
6131 // A B C Result
6132 // 0 0 0 0
6133 // 0 0 1 0
6134 // 0 1 0 1
6135 // 0 1 1 0
6136 // 1 0 0 0
6137 // 1 0 1 1
6138 // 1 1 0 1
6139 // 1 1 1 1
6140 //
// Result going from high bit to low bit is 0b11100100 = 0xE4
6142 // ---------------------------------------
6143 
6144 #ifdef _LP64
6145 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
6146   match(Set dst (CopySignF dst src));
6147   effect(TEMP tmp1, TEMP tmp2);
6148   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
6149   ins_encode %{
6150     __ movl($tmp2$$Register, 0x7FFFFFFF);
6151     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
6152     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
6153   %}
6154   ins_pipe( pipe_slow );
6155 %}
6156 
6157 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
6158   match(Set dst (CopySignD dst (Binary src zero)));
6159   ins_cost(100);
6160   effect(TEMP tmp1, TEMP tmp2);
6161   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
6162   ins_encode %{
6163     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
6164     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
6165     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
6166   %}
6167   ins_pipe( pipe_slow );
6168 %}
6169 #endif // _LP64
6170 
6171 // --------------------------------- Sqrt --------------------------------------
6172 
6173 instruct vsqrtF_reg(vec dst, vec src) %{
6174   match(Set dst (SqrtVF src));
6175   ins_cost(400);
6176   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
6177   ins_encode %{
6178     assert(UseAVX > 0, "required");
6179     int vlen_enc = vector_length_encoding(this);
6180     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6181   %}
6182   ins_pipe( pipe_slow );
6183 %}
6184 
6185 instruct vsqrtF_mem(vec dst, memory mem) %{
6186   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
6187   match(Set dst (SqrtVF (LoadVector mem)));
6188   ins_cost(400);
6189   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
6190   ins_encode %{
6191     assert(UseAVX > 0, "required");
6192     int vlen_enc = vector_length_encoding(this);
6193     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
6194   %}
6195   ins_pipe( pipe_slow );
6196 %}
6197 
6198 // Floating point vector sqrt
6199 instruct vsqrtD_reg(vec dst, vec src) %{
6200   match(Set dst (SqrtVD src));
6201   ins_cost(400);
6202   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
6203   ins_encode %{
6204     assert(UseAVX > 0, "required");
6205     int vlen_enc = vector_length_encoding(this);
6206     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6207   %}
6208   ins_pipe( pipe_slow );
6209 %}
6210 
6211 instruct vsqrtD_mem(vec dst, memory mem) %{
6212   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
6213   match(Set dst (SqrtVD (LoadVector mem)));
6214   ins_cost(400);
6215   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
6216   ins_encode %{
6217     assert(UseAVX > 0, "required");
6218     int vlen_enc = vector_length_encoding(this);
6219     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
6220   %}
6221   ins_pipe( pipe_slow );
6222 %}
6223 
6224 // ------------------------------ Shift ---------------------------------------
6225 
6226 // Left and right shift count vectors are the same on x86
6227 // (only lowest bits of xmm reg are used for count).
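// (psllw/pslld/psllq and their right-shift counterparts all read the count from
// the low 64 bits of the xmm operand, so one count loaded with movdl serves
// every element size and both shift directions.)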
6228 instruct vshiftcnt(vec dst, rRegI cnt) %{
6229   match(Set dst (LShiftCntV cnt));
6230   match(Set dst (RShiftCntV cnt));
6231   format %{ "movdl    $dst,$cnt\t! load shift count" %}
6232   ins_encode %{
6233     __ movdl($dst$$XMMRegister, $cnt$$Register);
6234   %}
6235   ins_pipe( pipe_slow );
6236 %}
6237 
6238 // Byte vector shift
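// As with byte multiply, there is no packed byte shift on x86, so bytes are
// widened to words, shifted with the word forms, masked back to their low
// bytes and repacked.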
6239 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6240   predicate(Matcher::vector_length(n) <= 8 && !n->as_ShiftV()->is_var_shift());
6241   match(Set dst ( LShiftVB src shift));
6242   match(Set dst ( RShiftVB src shift));
6243   match(Set dst (URShiftVB src shift));
6244   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
6245   format %{"vector_byte_shift $dst,$src,$shift" %}
6246   ins_encode %{
6247     assert(UseSSE > 3, "required");
6248     int opcode = this->ideal_Opcode();
6249     bool sign = (opcode != Op_URShiftVB);
6250     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
6251     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
6252     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6253     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
6254     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
6255   %}
6256   ins_pipe( pipe_slow );
6257 %}
6258 
6259 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6260   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
6261             UseAVX <= 1);
6262   match(Set dst ( LShiftVB src shift));
6263   match(Set dst ( RShiftVB src shift));
6264   match(Set dst (URShiftVB src shift));
6265   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
6266   format %{"vector_byte_shift $dst,$src,$shift" %}
6267   ins_encode %{
6268     assert(UseSSE > 3, "required");
6269     int opcode = this->ideal_Opcode();
6270     bool sign = (opcode != Op_URShiftVB);
6271     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
6272     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
6273     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
6274     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
6275     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
6276     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6277     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
6278     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
6279     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
6280   %}
6281   ins_pipe( pipe_slow );
6282 %}
6283 
6284 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6285   predicate(Matcher::vector_length(n) == 16 && !n->as_ShiftV()->is_var_shift() &&
6286             UseAVX > 1);
6287   match(Set dst ( LShiftVB src shift));
6288   match(Set dst ( RShiftVB src shift));
6289   match(Set dst (URShiftVB src shift));
6290   effect(TEMP dst, TEMP tmp, TEMP scratch);
6291   format %{"vector_byte_shift $dst,$src,$shift" %}
6292   ins_encode %{
6293     int opcode = this->ideal_Opcode();
6294     bool sign = (opcode != Op_URShiftVB);
6295     int vlen_enc = Assembler::AVX_256bit;
6296     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
6297     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6298     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6299     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
6300     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
6301   %}
6302   ins_pipe( pipe_slow );
6303 %}
6304 
6305 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6306   predicate(Matcher::vector_length(n) == 32 && !n->as_ShiftV()->is_var_shift());
6307   match(Set dst ( LShiftVB src shift));
6308   match(Set dst ( RShiftVB src shift));
6309   match(Set dst (URShiftVB src shift));
6310   effect(TEMP dst, TEMP tmp, TEMP scratch);
6311   format %{"vector_byte_shift $dst,$src,$shift" %}
6312   ins_encode %{
6313     assert(UseAVX > 1, "required");
6314     int opcode = this->ideal_Opcode();
6315     bool sign = (opcode != Op_URShiftVB);
6316     int vlen_enc = Assembler::AVX_256bit;
6317     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
6318     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6319     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6320     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6321     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6322     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6323     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6324     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6325     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6326   %}
6327   ins_pipe( pipe_slow );
6328 %}
6329 
6330 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6331   predicate(Matcher::vector_length(n) == 64 && !n->as_ShiftV()->is_var_shift());
6332   match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
6334   match(Set dst (URShiftVB src shift));
6335   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6336   format %{"vector_byte_shift $dst,$src,$shift" %}
6337   ins_encode %{
6338     assert(UseAVX > 2, "required");
6339     int opcode = this->ideal_Opcode();
6340     bool sign = (opcode != Op_URShiftVB);
6341     int vlen_enc = Assembler::AVX_512bit;
6342     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6343     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
6344     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6345     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6346     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6347     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6348     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6349     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6350     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6351     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
6352     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
6353     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6354   %}
6355   ins_pipe( pipe_slow );
6356 %}
6357 
// Shorts vector logical right shift produces an incorrect Java result
// for negative data because Java code converts the short value into an int
// with sign extension before the shift. Char vectors are fine, however,
// since chars are unsigned values.
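// A worked example (illustrative values): for short s = -4 (0xFFFC), Java
// evaluates (short)(s >>> 1) as (short)(0xFFFFFFFC >>> 1) = (short)0x7FFFFFFE
// = -2, whereas a packed 16-bit logical shift of 0xFFFC yields 0x7FFE = 32766.
// For char c = 0xFFFC the widened value is 0x0000FFFC, so both paths agree on
// 0x7FFE.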
6362 // Shorts/Chars vector left shift
6363 instruct vshiftS(vec dst, vec src, vec shift) %{
6364   predicate(!n->as_ShiftV()->is_var_shift());
6365   match(Set dst ( LShiftVS src shift));
6366   match(Set dst ( RShiftVS src shift));
6367   match(Set dst (URShiftVS src shift));
6368   effect(TEMP dst, USE src, USE shift);
6369   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
6370   ins_encode %{
6371     int opcode = this->ideal_Opcode();
6372     if (UseAVX > 0) {
6373       int vlen_enc = vector_length_encoding(this);
6374       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6375     } else {
6376       int vlen = Matcher::vector_length(this);
6377       if (vlen == 2) {
6378         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6379         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6380       } else if (vlen == 4) {
6381         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6382         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6383       } else {
6384         assert (vlen == 8, "sanity");
6385         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6386         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6387       }
6388     }
6389   %}
6390   ins_pipe( pipe_slow );
6391 %}
6392 
6393 // Integers vector left shift
6394 instruct vshiftI(vec dst, vec src, vec shift) %{
6395   predicate(!n->as_ShiftV()->is_var_shift());
6396   match(Set dst ( LShiftVI src shift));
6397   match(Set dst ( RShiftVI src shift));
6398   match(Set dst (URShiftVI src shift));
6399   effect(TEMP dst, USE src, USE shift);
6400   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
6401   ins_encode %{
6402     int opcode = this->ideal_Opcode();
6403     if (UseAVX > 0) {
6404       int vlen_enc = vector_length_encoding(this);
6405       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6406     } else {
6407       int vlen = Matcher::vector_length(this);
6408       if (vlen == 2) {
6409         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6410         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6411       } else {
6412         assert(vlen == 4, "sanity");
6413         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6414         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6415       }
6416     }
6417   %}
6418   ins_pipe( pipe_slow );
6419 %}
6420 
6421 // Integers vector left constant shift
6422 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
6423   match(Set dst (LShiftVI src (LShiftCntV shift)));
6424   match(Set dst (RShiftVI src (RShiftCntV shift)));
6425   match(Set dst (URShiftVI src (RShiftCntV shift)));
6426   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
6427   ins_encode %{
6428     int opcode = this->ideal_Opcode();
6429     if (UseAVX > 0) {
6430       int vector_len = vector_length_encoding(this);
6431       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6432     } else {
6433       int vlen = Matcher::vector_length(this);
6434       if (vlen == 2) {
6435         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6436         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6437       } else {
6438         assert(vlen == 4, "sanity");
6439         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6440         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6441       }
6442     }
6443   %}
6444   ins_pipe( pipe_slow );
6445 %}
6446 
6447 // Longs vector shift
6448 instruct vshiftL(vec dst, vec src, vec shift) %{
6449   predicate(!n->as_ShiftV()->is_var_shift());
6450   match(Set dst ( LShiftVL src shift));
6451   match(Set dst (URShiftVL src shift));
6452   effect(TEMP dst, USE src, USE shift);
6453   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
6454   ins_encode %{
6455     int opcode = this->ideal_Opcode();
6456     if (UseAVX > 0) {
6457       int vlen_enc = vector_length_encoding(this);
6458       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6459     } else {
6460       assert(Matcher::vector_length(this) == 2, "");
6461       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6462       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6463     }
6464   %}
6465   ins_pipe( pipe_slow );
6466 %}
6467 
6468 // Longs vector constant shift
6469 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
6470   match(Set dst (LShiftVL src (LShiftCntV shift)));
6471   match(Set dst (URShiftVL src (RShiftCntV shift)));
6472   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
6473   ins_encode %{
6474     int opcode = this->ideal_Opcode();
6475     if (UseAVX > 0) {
6476       int vector_len = vector_length_encoding(this);
6477       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6478     } else {
6479       assert(Matcher::vector_length(this) == 2, "");
6480       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6481       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6482     }
6483   %}
6484   ins_pipe( pipe_slow );
6485 %}
6486 
6487 // -------------------ArithmeticRightShift -----------------------------------
6488 // Long vector arithmetic right shift
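// Before AVX-512 there is no packed arithmetic right shift for 64-bit lanes
// (vpsraq is EVEX-only), so it is emulated with logical shifts and the
// sign-bit mask from vector_long_sign_mask():
//   m      = 0x8000000000000000 >>> n
//   x >> n = ((x >>> n) ^ m) - m
// i.e. the logical shift result is sign-extended by flipping and then
// subtracting the shifted sign bit.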
6489 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6490   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX <= 2);
6491   match(Set dst (RShiftVL src shift));
6492   effect(TEMP dst, TEMP tmp, TEMP scratch);
6493   format %{ "vshiftq $dst,$src,$shift" %}
6494   ins_encode %{
6495     uint vlen = Matcher::vector_length(this);
6496     if (vlen == 2) {
6497       assert(UseSSE >= 2, "required");
6498       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6499       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6500       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6501       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6502       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6503       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6504     } else {
6505       assert(vlen == 4, "sanity");
6506       assert(UseAVX > 1, "required");
6507       int vlen_enc = Assembler::AVX_256bit;
6508       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6509       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6510       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6511       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6512       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6513     }
6514   %}
6515   ins_pipe( pipe_slow );
6516 %}
6517 
6518 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6519   predicate(!n->as_ShiftV()->is_var_shift() && UseAVX > 2);
6520   match(Set dst (RShiftVL src shift));
6521   format %{ "vshiftq $dst,$src,$shift" %}
6522   ins_encode %{
6523     int vlen_enc = vector_length_encoding(this);
6524     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6525   %}
6526   ins_pipe( pipe_slow );
6527 %}
6528 
6529 // ------------------- Variable Shift -----------------------------
6530 // Byte variable shift
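// x86 has no per-lane variable shift for byte elements. The varshiftbw /
// evarshiftb helpers widen the bytes, apply the variable shift at the wider
// element size, and the results are packed back down to bytes with
// vpackuswb. Without AVX512BW, vectors wider than 128 bits are processed one
// 128-bit chunk at a time (see the 16B/32B variants below).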
6531 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6532   predicate(Matcher::vector_length(n) <= 8 &&
6533             n->as_ShiftV()->is_var_shift() &&
6534             !VM_Version::supports_avx512bw());
6535   match(Set dst ( LShiftVB src shift));
6536   match(Set dst ( RShiftVB src shift));
6537   match(Set dst (URShiftVB src shift));
6538   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6539   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6540   ins_encode %{
6541     assert(UseAVX >= 2, "required");
6542 
6543     int opcode = this->ideal_Opcode();
6544     int vlen_enc = Assembler::AVX_128bit;
6545     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6546     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6547   %}
6548   ins_pipe( pipe_slow );
6549 %}
6550 
6551 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6552   predicate(Matcher::vector_length(n) == 16 &&
6553             n->as_ShiftV()->is_var_shift() &&
6554             !VM_Version::supports_avx512bw());
6555   match(Set dst ( LShiftVB src shift));
6556   match(Set dst ( RShiftVB src shift));
6557   match(Set dst (URShiftVB src shift));
6558   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6559   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6560   ins_encode %{
6561     assert(UseAVX >= 2, "required");
6562 
6563     int opcode = this->ideal_Opcode();
6564     int vlen_enc = Assembler::AVX_128bit;
6565     // Shift lower half and get word result in dst
6566     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6567 
6568     // Shift upper half and get word result in vtmp1
6569     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6570     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6571     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6572 
6573     // Merge and down convert the two word results to byte in dst
6574     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6575   %}
6576   ins_pipe( pipe_slow );
6577 %}
6578 
6579 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
6580   predicate(Matcher::vector_length(n) == 32 &&
6581             n->as_ShiftV()->is_var_shift() &&
6582             !VM_Version::supports_avx512bw());
6583   match(Set dst ( LShiftVB src shift));
6584   match(Set dst ( RShiftVB src shift));
6585   match(Set dst (URShiftVB src shift));
6586   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
6587   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
6588   ins_encode %{
6589     assert(UseAVX >= 2, "required");
6590 
6591     int opcode = this->ideal_Opcode();
6592     int vlen_enc = Assembler::AVX_128bit;
6593     // Process lower 128 bits and get result in dst
6594     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6595     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6596     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6597     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6598     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6599 
6600     // Process higher 128 bits and get result in vtmp3
6601     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6602     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6603     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
6604     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
6605     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
6606     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6607     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
6608 
6609     // Merge the two results in dst
6610     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6611   %}
6612   ins_pipe( pipe_slow );
6613 %}
6614 
6615 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6616   predicate(Matcher::vector_length(n) <= 32 &&
6617             n->as_ShiftV()->is_var_shift() &&
6618             VM_Version::supports_avx512bw());
6619   match(Set dst ( LShiftVB src shift));
6620   match(Set dst ( RShiftVB src shift));
6621   match(Set dst (URShiftVB src shift));
6622   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6623   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6624   ins_encode %{
6625     assert(UseAVX > 2, "required");
6626 
6627     int opcode = this->ideal_Opcode();
6628     int vlen_enc = vector_length_encoding(this);
6629     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6630   %}
6631   ins_pipe( pipe_slow );
6632 %}
6633 
6634 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6635   predicate(Matcher::vector_length(n) == 64 &&
6636             n->as_ShiftV()->is_var_shift() &&
6637             VM_Version::supports_avx512bw());
6638   match(Set dst ( LShiftVB src shift));
6639   match(Set dst ( RShiftVB src shift));
6640   match(Set dst (URShiftVB src shift));
6641   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6642   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6643   ins_encode %{
6644     assert(UseAVX > 2, "required");
6645 
6646     int opcode = this->ideal_Opcode();
6647     int vlen_enc = Assembler::AVX_256bit;
6648     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6649     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6650     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6651     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6652     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6653   %}
6654   ins_pipe( pipe_slow );
6655 %}
6656 
6657 // Short variable shift
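// Without AVX512BW there is no variable per-lane shift for 16-bit elements
// (vpsllvw/vpsravw/vpsrlvw are AVX512BW instructions). The shorts are
// widened to 32-bit lanes, shifted with the variable dword shift, masked
// back to 16 bits (vector_int_to_short_mask) and re-packed with vpackusdw.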
6658 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6659   predicate(Matcher::vector_length(n) <= 8 &&
6660             n->as_ShiftV()->is_var_shift() &&
6661             !VM_Version::supports_avx512bw());
6662   match(Set dst ( LShiftVS src shift));
6663   match(Set dst ( RShiftVS src shift));
6664   match(Set dst (URShiftVS src shift));
6665   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6666   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6667   ins_encode %{
6668     assert(UseAVX >= 2, "required");
6669 
6670     int opcode = this->ideal_Opcode();
6671     bool sign = (opcode != Op_URShiftVS);
6672     int vlen_enc = Assembler::AVX_256bit;
6673     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6674     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6675     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
6676     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6677     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
6678     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6679   %}
6680   ins_pipe( pipe_slow );
6681 %}
6682 
6683 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6684   predicate(Matcher::vector_length(n) == 16 &&
6685             n->as_ShiftV()->is_var_shift() &&
6686             !VM_Version::supports_avx512bw());
6687   match(Set dst ( LShiftVS src shift));
6688   match(Set dst ( RShiftVS src shift));
6689   match(Set dst (URShiftVS src shift));
6690   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6691   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
6692   ins_encode %{
6693     assert(UseAVX >= 2, "required");
6694 
6695     int opcode = this->ideal_Opcode();
6696     bool sign = (opcode != Op_URShiftVS);
6697     int vlen_enc = Assembler::AVX_256bit;
6698     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
6699     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6700     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6701     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6702     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6703 
6704     // Shift upper half, with result in dst using vtmp1 as TEMP
6705     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
6706     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
6707     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6708     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6709     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6710     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6711 
6712     // Merge lower and upper half result into dst
6713     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6714     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6715   %}
6716   ins_pipe( pipe_slow );
6717 %}
6718 
6719 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
6720   predicate(n->as_ShiftV()->is_var_shift() &&
6721             VM_Version::supports_avx512bw());
6722   match(Set dst ( LShiftVS src shift));
6723   match(Set dst ( RShiftVS src shift));
6724   match(Set dst (URShiftVS src shift));
6725   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
6726   ins_encode %{
6727     assert(UseAVX > 2, "required");
6728 
6729     int opcode = this->ideal_Opcode();
6730     int vlen_enc = vector_length_encoding(this);
6731     if (!VM_Version::supports_avx512vl()) {
6732       vlen_enc = Assembler::AVX_512bit;
6733     }
6734     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6735   %}
6736   ins_pipe( pipe_slow );
6737 %}
6738 
6739 // Integer variable shift
6740 instruct vshiftI_var(vec dst, vec src, vec shift) %{
6741   predicate(n->as_ShiftV()->is_var_shift());
6742   match(Set dst ( LShiftVI src shift));
6743   match(Set dst ( RShiftVI src shift));
6744   match(Set dst (URShiftVI src shift));
6745   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
6746   ins_encode %{
6747     assert(UseAVX >= 2, "required");
6748 
6749     int opcode = this->ideal_Opcode();
6750     int vlen_enc = vector_length_encoding(this);
6751     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6752   %}
6753   ins_pipe( pipe_slow );
6754 %}
6755 
6756 // Long variable shift
6757 instruct vshiftL_var(vec dst, vec src, vec shift) %{
6758   predicate(n->as_ShiftV()->is_var_shift());
6759   match(Set dst ( LShiftVL src shift));
6760   match(Set dst (URShiftVL src shift));
6761   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6762   ins_encode %{
6763     assert(UseAVX >= 2, "required");
6764 
6765     int opcode = this->ideal_Opcode();
6766     int vlen_enc = vector_length_encoding(this);
6767     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6768   %}
6769   ins_pipe( pipe_slow );
6770 %}
6771 
6772 // Long variable arithmetic right shift
6773 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
6774   predicate(Matcher::vector_length(n) <= 4 &&
6775             n->as_ShiftV()->is_var_shift() &&
6776             UseAVX == 2);
6777   match(Set dst (RShiftVL src shift));
6778   effect(TEMP dst, TEMP vtmp);
6779   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
6780   ins_encode %{
6781     int opcode = this->ideal_Opcode();
6782     int vlen_enc = vector_length_encoding(this);
6783     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
6784                  $vtmp$$XMMRegister);
6785   %}
6786   ins_pipe( pipe_slow );
6787 %}
6788 
6789 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
6790   predicate(n->as_ShiftV()->is_var_shift() &&
6791             UseAVX > 2);
6792   match(Set dst (RShiftVL src shift));
6793   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6794   ins_encode %{
6795     int opcode = this->ideal_Opcode();
6796     int vlen_enc = vector_length_encoding(this);
6797     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6798   %}
6799   ins_pipe( pipe_slow );
6800 %}
6801 
6802 // --------------------------------- AND --------------------------------------
6803 
6804 instruct vand(vec dst, vec src) %{
6805   predicate(UseAVX == 0);
6806   match(Set dst (AndV dst src));
6807   format %{ "pand    $dst,$src\t! and vectors" %}
6808   ins_encode %{
6809     __ pand($dst$$XMMRegister, $src$$XMMRegister);
6810   %}
6811   ins_pipe( pipe_slow );
6812 %}
6813 
6814 instruct vand_reg(vec dst, vec src1, vec src2) %{
6815   predicate(UseAVX > 0);
6816   match(Set dst (AndV src1 src2));
6817   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
6818   ins_encode %{
6819     int vlen_enc = vector_length_encoding(this);
6820     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6821   %}
6822   ins_pipe( pipe_slow );
6823 %}
6824 
6825 instruct vand_mem(vec dst, vec src, memory mem) %{
6826   predicate((UseAVX > 0) &&
6827             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6828   match(Set dst (AndV src (LoadVector mem)));
6829   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
6830   ins_encode %{
6831     int vlen_enc = vector_length_encoding(this);
6832     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6833   %}
6834   ins_pipe( pipe_slow );
6835 %}
6836 
6837 // --------------------------------- OR ---------------------------------------
6838 
6839 instruct vor(vec dst, vec src) %{
6840   predicate(UseAVX == 0);
6841   match(Set dst (OrV dst src));
6842   format %{ "por     $dst,$src\t! or vectors" %}
6843   ins_encode %{
6844     __ por($dst$$XMMRegister, $src$$XMMRegister);
6845   %}
6846   ins_pipe( pipe_slow );
6847 %}
6848 
6849 instruct vor_reg(vec dst, vec src1, vec src2) %{
6850   predicate(UseAVX > 0);
6851   match(Set dst (OrV src1 src2));
6852   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
6853   ins_encode %{
6854     int vlen_enc = vector_length_encoding(this);
6855     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6856   %}
6857   ins_pipe( pipe_slow );
6858 %}
6859 
6860 instruct vor_mem(vec dst, vec src, memory mem) %{
6861   predicate((UseAVX > 0) &&
6862             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6863   match(Set dst (OrV src (LoadVector mem)));
6864   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
6865   ins_encode %{
6866     int vlen_enc = vector_length_encoding(this);
6867     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6868   %}
6869   ins_pipe( pipe_slow );
6870 %}
6871 
6872 // --------------------------------- XOR --------------------------------------
6873 
6874 instruct vxor(vec dst, vec src) %{
6875   predicate(UseAVX == 0);
6876   match(Set dst (XorV dst src));
6877   format %{ "pxor    $dst,$src\t! xor vectors" %}
6878   ins_encode %{
6879     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
6880   %}
6881   ins_pipe( pipe_slow );
6882 %}
6883 
6884 instruct vxor_reg(vec dst, vec src1, vec src2) %{
6885   predicate(UseAVX > 0);
6886   match(Set dst (XorV src1 src2));
6887   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
6888   ins_encode %{
6889     int vlen_enc = vector_length_encoding(this);
6890     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6891   %}
6892   ins_pipe( pipe_slow );
6893 %}
6894 
6895 instruct vxor_mem(vec dst, vec src, memory mem) %{
6896   predicate((UseAVX > 0) &&
6897             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6898   match(Set dst (XorV src (LoadVector mem)));
6899   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
6900   ins_encode %{
6901     int vlen_enc = vector_length_encoding(this);
6902     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6903   %}
6904   ins_pipe( pipe_slow );
6905 %}
6906 
6907 // --------------------------------- VectorCast --------------------------------------
6908 
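// Widening casts sign-extend with vpmovsx* and, for floating point targets,
// follow up with vcvtdq2ps/vcvtdq2pd. Narrowing casts use the EVEX
// truncating moves (evpmovwb, evpmovdb, evpmovdw, evpmovqb, evpmovqw,
// evpmovqd) where available, and otherwise fall back to mask-and-pack
// sequences (see castStoX, castItoX and vcastLtoBS below).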
6909 instruct vcastBtoX(vec dst, vec src) %{
6910   match(Set dst (VectorCastB2X src));
6911   format %{ "vector_cast_b2x $dst,$src\t!" %}
6912   ins_encode %{
6913     assert(UseAVX > 0, "required");
6914 
6915     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6916     int vlen_enc = vector_length_encoding(this);
6917     switch (to_elem_bt) {
6918       case T_SHORT:
6919         __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6920         break;
6921       case T_INT:
6922         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6923         break;
6924       case T_FLOAT:
6925         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6926         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6927         break;
6928       case T_LONG:
6929         __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6930         break;
6931       case T_DOUBLE: {
6932         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
6933         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
6934         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6935         break;
6936       }
6937       default: assert(false, "%s", type2name(to_elem_bt));
6938     }
6939   %}
6940   ins_pipe( pipe_slow );
6941 %}
6942 
6943 instruct castStoX(vec dst, vec src, rRegP scratch) %{
6944   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6945             Matcher::vector_length(n->in(1)) <= 8 && // src
6946             Matcher::vector_element_basic_type(n) == T_BYTE);
6947   effect(TEMP scratch);
6948   match(Set dst (VectorCastS2X src));
6949   format %{ "vector_cast_s2x $dst,$src\t! using $scratch as TEMP" %}
6950   ins_encode %{
6951     assert(UseAVX > 0, "required");
6952 
6953     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, $scratch$$Register);
6954     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6955   %}
6956   ins_pipe( pipe_slow );
6957 %}
6958 
6959 instruct vcastStoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6960   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6961             Matcher::vector_length(n->in(1)) == 16 && // src
6962             Matcher::vector_element_basic_type(n) == T_BYTE);
6963   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6964   match(Set dst (VectorCastS2X src));
6965   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp, $scratch as TEMP" %}
6966   ins_encode %{
6967     assert(UseAVX > 0, "required");
6968 
6969     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
6970     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6971     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
6972     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6973   %}
6974   ins_pipe( pipe_slow );
6975 %}
6976 
6977 instruct vcastStoX_evex(vec dst, vec src) %{
6978   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
6979             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
6980   match(Set dst (VectorCastS2X src));
6981   format %{ "vector_cast_s2x $dst,$src\t!" %}
6982   ins_encode %{
6983     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6984     int src_vlen_enc = vector_length_encoding(this, $src);
6985     int vlen_enc = vector_length_encoding(this);
6986     switch (to_elem_bt) {
6987       case T_BYTE:
6988         if (!VM_Version::supports_avx512vl()) {
6989           vlen_enc = Assembler::AVX_512bit;
6990         }
6991         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6992         break;
6993       case T_INT:
6994         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6995         break;
6996       case T_FLOAT:
6997         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6998         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6999         break;
7000       case T_LONG:
7001         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7002         break;
7003       case T_DOUBLE: {
7004         int mid_vlen_enc = (vlen_enc == Assembler::AVX_512bit) ? Assembler::AVX_256bit : Assembler::AVX_128bit;
7005         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, mid_vlen_enc);
7006         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7007         break;
7008       }
7009       default:
7010         ShouldNotReachHere();
7011     }
7012   %}
7013   ins_pipe( pipe_slow );
7014 %}
7015 
7016 instruct castItoX(vec dst, vec src, rRegP scratch) %{
7017   predicate(UseAVX <= 2 &&
7018             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
7019             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
7020   match(Set dst (VectorCastI2X src));
7021   format %{ "vector_cast_i2x $dst,$src\t! using $scratch as TEMP" %}
7022   effect(TEMP scratch);
7023   ins_encode %{
7024     assert(UseAVX > 0, "required");
7025 
7026     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7027     int vlen_enc = vector_length_encoding(this, $src);
7028 
7029     if (to_elem_bt == T_BYTE) {
7030       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
7031       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7032       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7033     } else {
7034       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
7035       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
7036       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7037     }
7038   %}
7039   ins_pipe( pipe_slow );
7040 %}
7041 
7042 instruct vcastItoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
7043   predicate(UseAVX <= 2 &&
7044             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
7045             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
7046   match(Set dst (VectorCastI2X src));
7047   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp and $scratch as TEMP" %}
7048   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7049   ins_encode %{
7050     assert(UseAVX > 0, "required");
7051 
7052     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7053     int vlen_enc = vector_length_encoding(this, $src);
7054 
7055     if (to_elem_bt == T_BYTE) {
7056       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
7057       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
7058       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7059       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7060     } else {
7061       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
7062       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
7063       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
7064       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7065     }
7066   %}
7067   ins_pipe( pipe_slow );
7068 %}
7069 
7070 instruct vcastItoX_evex(vec dst, vec src) %{
7071   predicate(UseAVX > 2 ||
7072             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
7073   match(Set dst (VectorCastI2X src));
7074   format %{ "vector_cast_i2x $dst,$src\t!" %}
7075   ins_encode %{
7076     assert(UseAVX > 0, "required");
7077 
7078     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
7079     int src_vlen_enc = vector_length_encoding(this, $src);
7080     int dst_vlen_enc = vector_length_encoding(this);
7081     switch (dst_elem_bt) {
7082       case T_BYTE:
7083         if (!VM_Version::supports_avx512vl()) {
7084           src_vlen_enc = Assembler::AVX_512bit;
7085         }
7086         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7087         break;
7088       case T_SHORT:
7089         if (!VM_Version::supports_avx512vl()) {
7090           src_vlen_enc = Assembler::AVX_512bit;
7091         }
7092         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7093         break;
7094       case T_FLOAT:
7095         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7096         break;
7097       case T_LONG:
7098         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7099         break;
7100       case T_DOUBLE:
7101         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
7102         break;
7103       default:
7104         ShouldNotReachHere();
7105     }
7106   %}
7107   ins_pipe( pipe_slow );
7108 %}
7109 
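// Long -> byte/short narrowing without AVX-512 truncating moves: gather the
// low 32 bits of each 64-bit lane into the low half of the register
// (vpshufd for 128-bit sources, vpermilps + vpermpd for 256-bit sources),
// then mask and pack down to the target element size.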
7110 instruct vcastLtoBS(vec dst, vec src, rRegP scratch) %{
7111   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
7112             UseAVX <= 2);
7113   match(Set dst (VectorCastL2X src));
7114   effect(TEMP scratch);
7115   format %{ "vector_cast_l2x  $dst,$src\t! using $scratch as TEMP" %}
7116   ins_encode %{
7117     assert(UseAVX > 0, "required");
7118 
7119     int vlen = Matcher::vector_length_in_bytes(this, $src);
7120     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
7121     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
7122                                                       : ExternalAddress(vector_int_to_short_mask());
7123     if (vlen <= 16) {
7124       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
7125       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
7126       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7127     } else {
7128       assert(vlen <= 32, "required");
7129       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
7130       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
7131       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
7132       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7133     }
7134     if (to_elem_bt == T_BYTE) {
7135       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
7136     }
7137   %}
7138   ins_pipe( pipe_slow );
7139 %}
7140 
7141 instruct vcastLtoX_evex(vec dst, vec src) %{
7142   predicate(UseAVX > 2 ||
7143             (Matcher::vector_element_basic_type(n) == T_INT ||
7144              Matcher::vector_element_basic_type(n) == T_FLOAT ||
7145              Matcher::vector_element_basic_type(n) == T_DOUBLE));
7146   match(Set dst (VectorCastL2X src));
7147   format %{ "vector_cast_l2x  $dst,$src\t!" %}
7148   ins_encode %{
7149     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7150     int vlen = Matcher::vector_length_in_bytes(this, $src);
7151     int vlen_enc = vector_length_encoding(this, $src);
7152     switch (to_elem_bt) {
7153       case T_BYTE:
7154         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
7155           vlen_enc = Assembler::AVX_512bit;
7156         }
7157         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7158         break;
7159       case T_SHORT:
7160         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
7161           vlen_enc = Assembler::AVX_512bit;
7162         }
7163         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7164         break;
7165       case T_INT:
7166         if (vlen == 8) {
7167           if ($dst$$XMMRegister != $src$$XMMRegister) {
7168             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
7169           }
7170         } else if (vlen == 16) {
7171           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
7172         } else if (vlen == 32) {
7173           if (UseAVX > 2) {
7174             if (!VM_Version::supports_avx512vl()) {
7175               vlen_enc = Assembler::AVX_512bit;
7176             }
7177             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7178           } else {
7179             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
7180             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
7181           }
7182         } else { // vlen == 64
7183           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7184         }
7185         break;
7186       case T_FLOAT:
7187         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
7188         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7189         break;
7190       case T_DOUBLE:
7191         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
7192         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7193         break;
7194 
7195       default: assert(false, "%s", type2name(to_elem_bt));
7196     }
7197   %}
7198   ins_pipe( pipe_slow );
7199 %}
7200 
7201 instruct vcastFtoD_reg(vec dst, vec src) %{
7202   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
7203   match(Set dst (VectorCastF2X src));
7204   format %{ "vector_cast_f2d  $dst,$src\t!" %}
7205   ins_encode %{
7206     int vlen_enc = vector_length_encoding(this);
7207     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7208   %}
7209   ins_pipe( pipe_slow );
7210 %}
7211 
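// Float -> int casts must follow Java semantics: NaN converts to 0 and
// out-of-range values saturate to Integer.MIN_VALUE/MAX_VALUE. cvttps2dq
// returns the indefinite value 0x80000000 for such lanes, so the
// vector_castF2I_* helpers use vector_float_signflip() to spot those lanes
// and fix them up afterwards.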
7212 instruct vcastFtoI_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, vec xtmp4, rRegP scratch, rFlagsReg cr) %{
7213   predicate(!VM_Version::supports_avx512vl() &&
7214             Matcher::vector_length_in_bytes(n) < 64 &&
7215             Matcher::vector_element_basic_type(n) == T_INT);
7216   match(Set dst (VectorCastF2X src));
7217   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP xtmp4, TEMP scratch, KILL cr);
7218   format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $xtmp3 and $xtmp4 as TEMP" %}
7219   ins_encode %{
7220     int vlen_enc = vector_length_encoding(this);
7221     __ vector_castF2I_avx($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7222                           $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $xtmp4$$XMMRegister,
7223                           ExternalAddress(vector_float_signflip()), $scratch$$Register, vlen_enc);
7224   %}
7225   ins_pipe( pipe_slow );
7226 %}
7227 
7228 instruct vcastFtoI_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
7229   predicate((VM_Version::supports_avx512vl() ||
7230              Matcher::vector_length_in_bytes(n) == 64) &&
7231              Matcher::vector_element_basic_type(n) == T_INT);
7232   match(Set dst (VectorCastF2X src));
7233   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
7234   format %{ "vector_cast_f2i $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
7235   ins_encode %{
7236     int vlen_enc = vector_length_encoding(this);
7237     __ vector_castF2I_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7238                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
7239                            ExternalAddress(vector_float_signflip()), $scratch$$Register, vlen_enc);
7240   %}
7241   ins_pipe( pipe_slow );
7242 %}
7243 
7244 instruct vcastDtoF_reg(vec dst, vec src) %{
7245   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
7246   match(Set dst (VectorCastD2X src));
7247   format %{ "vector_cast_d2x  $dst,$src\t!" %}
7248   ins_encode %{
7249     int vlen_enc = vector_length_encoding(this, $src);
7250     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7251   %}
7252   ins_pipe( pipe_slow );
7253 %}
7254 
7255 instruct vcastDtoL_reg_evex(vec dst, vec src, vec xtmp1, vec xtmp2, kReg ktmp1, kReg ktmp2, rRegP scratch, rFlagsReg cr) %{
7256   predicate(Matcher::vector_element_basic_type(n) == T_LONG);
7257   match(Set dst (VectorCastD2X src));
7258   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP ktmp1, TEMP ktmp2, TEMP scratch, KILL cr);
7259   format %{ "vector_cast_d2l $dst,$src\t! using $xtmp1, $xtmp2, $ktmp1 and $ktmp2 as TEMP" %}
7260   ins_encode %{
7261     int vlen_enc = vector_length_encoding(this);
7262     __ vector_castD2L_evex($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
7263                            $xtmp2$$XMMRegister, $ktmp1$$KRegister, $ktmp2$$KRegister,
7264                            ExternalAddress(vector_double_signflip()), $scratch$$Register, vlen_enc);
7265   %}
7266   ins_pipe( pipe_slow );
7267 %}
7268 
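// Unsigned (zero-extending) widening casts; the vector_unsigned_cast helper
// selects the appropriate vpmovzx* form for the given source and destination
// element widths.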
7269 instruct vucast(vec dst, vec src) %{
7270   match(Set dst (VectorUCastB2X src));
7271   match(Set dst (VectorUCastS2X src));
7272   match(Set dst (VectorUCastI2X src));
7273   format %{ "vector_ucast $dst,$src\t!" %}
7274   ins_encode %{
7275     assert(UseAVX > 0, "required");
7276 
7277     BasicType from_elem_bt = Matcher::vector_element_basic_type(this, $src);
7278     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
7279     int vlen_enc = vector_length_encoding(this);
7280     __ vector_unsigned_cast($dst$$XMMRegister, $src$$XMMRegister, vlen_enc, from_elem_bt, to_elem_bt);
7281   %}
7282   ins_pipe( pipe_slow );
7283 %}
7284 
7285 // --------------------------------- VectorMaskCmp --------------------------------------
7286 
7287 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
7288   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7289             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
7290             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7291             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7292   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7293   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
7294   ins_encode %{
7295     int vlen_enc = vector_length_encoding(this, $src1);
7296     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7297     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7298       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7299     } else {
7300       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7301     }
7302   %}
7303   ins_pipe( pipe_slow );
7304 %}
7305 
7306 instruct evcmpFD64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
7307   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
7308             n->bottom_type()->isa_vectmask() == NULL &&
7309             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7310   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7311   effect(TEMP scratch, TEMP ktmp);
7312   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7313   ins_encode %{
7314     int vlen_enc = Assembler::AVX_512bit;
7315     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7316     KRegister mask = k0; // The comparison itself is not being masked.
7317     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7318       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7319       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
7320     } else {
7321       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7322       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
7323     }
7324   %}
7325   ins_pipe( pipe_slow );
7326 %}
7327 
7328 instruct evcmpFD(kReg dst, vec src1, vec src2, immI8 cond) %{
7329   predicate(n->bottom_type()->isa_vectmask() &&
7330             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
7331   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7332   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
7333   ins_encode %{
7334     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
7335     int vlen_enc = vector_length_encoding(this, $src1);
7336     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
7337     KRegister mask = k0; // The comparison itself is not being masked.
7338     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
7339       __ evcmpps($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7340     } else {
7341       __ evcmppd($dst$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
7342     }
7343   %}
7344   ins_pipe( pipe_slow );
7345 %}
7346 
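// Integer compares below 512 bits come in three flavours:
//  - vcmp_direct: eq/lt/gt map directly onto pcmpeq*/pcmpgt*.
//  - vcmp_negate: ne/le/ge compute the complementary predicate and then
//                 invert the result, using xtmp as a temporary.
//  - vcmpu:       unsigned predicates flip the sign bit of both operands
//                 first, since (u <u v) == ((u ^ 0x80..0) <s (v ^ 0x80..0)).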
7347 instruct vcmp_direct(legVec dst, legVec src1, legVec src2, immI8 cond) %{
7348   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7349             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7350             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
7351             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7352             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
7353             (n->in(2)->get_int() == BoolTest::eq ||
7354              n->in(2)->get_int() == BoolTest::lt ||
7355              n->in(2)->get_int() == BoolTest::gt)); // cond
7356   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7357   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
7358   ins_encode %{
7359     int vlen_enc = vector_length_encoding(this, $src1);
7360     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7361     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
7362     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, cmp, ww, vlen_enc);
7363   %}
7364   ins_pipe( pipe_slow );
7365 %}
7366 
7367 instruct vcmp_negate(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
7368   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7369             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7370             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
7371             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7372             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1))) &&
7373             (n->in(2)->get_int() == BoolTest::ne ||
7374              n->in(2)->get_int() == BoolTest::le ||
7375              n->in(2)->get_int() == BoolTest::ge)); // cond
7376   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7377   effect(TEMP dst, TEMP xtmp);
7378   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
7379   ins_encode %{
7380     int vlen_enc = vector_length_encoding(this, $src1);
7381     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7382     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
7383     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
7384   %}
7385   ins_pipe( pipe_slow );
7386 %}
7387 
7388 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec xtmp) %{
7389   predicate(n->bottom_type()->isa_vectmask() == NULL &&
7390             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
7391             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
7392             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
7393             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7394   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7395   effect(TEMP dst, TEMP xtmp);
7396   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $xtmp as TEMP" %}
7397   ins_encode %{
7398     InternalAddress flip_bit = $constantaddress(high_bit_set(Matcher::vector_element_basic_type(this, $src1)));
7399     int vlen_enc = vector_length_encoding(this, $src1);
7400     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7401     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
7402 
7403     if (vlen_enc == Assembler::AVX_128bit) {
7404       __ vmovddup($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
7405     } else {
7406       __ vbroadcastsd($xtmp$$XMMRegister, flip_bit, vlen_enc, noreg);
7407     }
7408     __ vpxor($dst$$XMMRegister, $xtmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
7409     __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $src2$$XMMRegister, vlen_enc);
7410     __ vpcmpCCW($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, cmp, ww, vlen_enc);
7411   %}
7412   ins_pipe( pipe_slow );
7413 %}
7414 
7415 instruct vcmp64(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
7416   predicate((n->bottom_type()->isa_vectmask() == NULL &&
7417              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
7418              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7419   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7420   effect(TEMP scratch, TEMP ktmp);
7421   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7422   ins_encode %{
7423     assert(UseAVX > 2, "required");
7424 
7425     int vlen_enc = vector_length_encoding(this, $src1);
7426     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7427     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
7428     KRegister mask = k0; // The comparison itself is not being masked.
7429     bool merge = false;
7430     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7431 
7432     switch (src1_elem_bt) {
7433       case T_INT: {
7434         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7435         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7436         break;
7437       }
7438       case T_LONG: {
7439         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7440         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7441         break;
7442       }
7443       default: assert(false, "%s", type2name(src1_elem_bt));
7444     }
7445   %}
7446   ins_pipe( pipe_slow );
7447 %}
7448 
7449 
7450 instruct evcmp(kReg dst, vec src1, vec src2, immI8 cond) %{
7451   predicate(n->bottom_type()->isa_vectmask() &&
7452             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7453   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7454   format %{ "vector_compare_evex $dst,$src1,$src2,$cond\t!" %}
7455   ins_encode %{
7456     assert(UseAVX > 2, "required");
7457     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
7458 
7459     int vlen_enc = vector_length_encoding(this, $src1);
7460     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7461     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
7462     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7463 
7464     // Compare directly into the destination mask register
7465     switch (src1_elem_bt) {
7466       case T_BYTE: {
7467         __ evpcmpb($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7468         break;
7469       }
7470       case T_SHORT: {
7471         __ evpcmpw($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7472         break;
7473       }
7474       case T_INT: {
7475         __ evpcmpd($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7476         break;
7477       }
7478       case T_LONG: {
7479         __ evpcmpq($dst$$KRegister, k0, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7480         break;
7481       }
7482       default: assert(false, "%s", type2name(src1_elem_bt));
7483     }
7484   %}
7485   ins_pipe( pipe_slow );
7486 %}
7487 
7488 // Extract
7489 
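// Element extraction goes through get_elem(). For vectors wider than
// 128 bits, get_lane() first copies the 128-bit lane holding the requested
// element into a temporary register, and get_elem() then reads the element
// from that lane.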
7490 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
7491   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
7492   match(Set dst (ExtractI src idx));
7493   match(Set dst (ExtractS src idx));
7494 #ifdef _LP64
7495   match(Set dst (ExtractB src idx));
7496 #endif
7497   format %{ "extractI $dst,$src,$idx\t!" %}
7498   ins_encode %{
7499     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7500 
7501     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7502     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7503   %}
7504   ins_pipe( pipe_slow );
7505 %}
7506 
7507 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
7508   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
7509             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
7510   match(Set dst (ExtractI src idx));
7511   match(Set dst (ExtractS src idx));
7512 #ifdef _LP64
7513   match(Set dst (ExtractB src idx));
7514 #endif
7515   effect(TEMP vtmp);
7516   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
7517   ins_encode %{
7518     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7519 
7520     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7521     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7522     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
7523   %}
7524   ins_pipe( pipe_slow );
7525 %}
7526 
7527 #ifdef _LP64
7528 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
7529   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
7530   match(Set dst (ExtractL src idx));
7531   format %{ "extractL $dst,$src,$idx\t!" %}
7532   ins_encode %{
7533     assert(UseSSE >= 4, "required");
7534     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7535 
7536     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7537   %}
7538   ins_pipe( pipe_slow );
7539 %}
7540 
7541 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
7542   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7543             Matcher::vector_length(n->in(1)) == 8);  // src
7544   match(Set dst (ExtractL src idx));
7545   effect(TEMP vtmp);
7546   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
7547   ins_encode %{
7548     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7549 
7550     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7551     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
7552   %}
7553   ins_pipe( pipe_slow );
7554 %}
7555 #endif
7556 
7557 instruct extractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7558   predicate(Matcher::vector_length(n->in(1)) <= 4);
7559   match(Set dst (ExtractF src idx));
7560   effect(TEMP dst, TEMP tmp, TEMP vtmp);
7561   format %{ "extractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7562   ins_encode %{
7563     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7564 
7565     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $tmp$$Register, $vtmp$$XMMRegister);
7566   %}
7567   ins_pipe( pipe_slow );
7568 %}
7569 
7570 instruct vextractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7571   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
7572             Matcher::vector_length(n->in(1)/*src*/) == 16);
7573   match(Set dst (ExtractF src idx));
7574   effect(TEMP tmp, TEMP vtmp);
7575   format %{ "vextractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7576   ins_encode %{
7577     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7578 
7579     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7580     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant, $tmp$$Register);
7581   %}
7582   ins_pipe( pipe_slow );
7583 %}
7584 
7585 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
7586   predicate(Matcher::vector_length(n->in(1)) == 2); // src
7587   match(Set dst (ExtractD src idx));
7588   format %{ "extractD $dst,$src,$idx\t!" %}
7589   ins_encode %{
7590     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7591 
7592     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7593   %}
7594   ins_pipe( pipe_slow );
7595 %}
7596 
7597 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
7598   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7599             Matcher::vector_length(n->in(1)) == 8);  // src
7600   match(Set dst (ExtractD src idx));
7601   effect(TEMP vtmp);
7602   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
7603   ins_encode %{
7604     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7605 
7606     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7607     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
7608   %}
7609   ins_pipe( pipe_slow );
7610 %}
7611 
7612 // --------------------------------- Vector Blend --------------------------------------
7613 
7614 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
7615   predicate(UseAVX == 0);
7616   match(Set dst (VectorBlend (Binary dst src) mask));
7617   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
7618   effect(TEMP tmp);
7619   ins_encode %{
7620     assert(UseSSE >= 4, "required");
7621 
7622     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
7623       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
7624     }
7625     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
7626   %}
7627   ins_pipe( pipe_slow );
7628 %}
7629 
7630 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
7631   predicate(UseAVX > 0 &&
7632             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
7633             Matcher::vector_length_in_bytes(n) <= 32 &&
7634             is_integral_type(Matcher::vector_element_basic_type(n)));
7635   match(Set dst (VectorBlend (Binary src1 src2) mask));
7636   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7637   ins_encode %{
7638     int vlen_enc = vector_length_encoding(this);
7639     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7640   %}
7641   ins_pipe( pipe_slow );
7642 %}
7643 
7644 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
7645   predicate(UseAVX > 0 &&
7646             n->in(2)->bottom_type()->isa_vectmask() == NULL &&
7647             Matcher::vector_length_in_bytes(n) <= 32 &&
7648             !is_integral_type(Matcher::vector_element_basic_type(n)));
7649   match(Set dst (VectorBlend (Binary src1 src2) mask));
7650   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7651   ins_encode %{
7652     int vlen_enc = vector_length_encoding(this);
7653     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7654   %}
7655   ins_pipe( pipe_slow );
7656 %}
7657 
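// 512-bit blends have no vpblendvb-style form that takes a vector mask, so
// the all-ones byte mask is first converted into a k-register by comparing
// it against vector_all_bits_set(), and the blend is then performed as a
// merge-masked move (evpblend).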
7658 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch, kReg ktmp) %{
7659   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
7660             n->in(2)->bottom_type()->isa_vectmask() == NULL);
7661   match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch and $ktmp as TEMP" %}
7663   effect(TEMP scratch, TEMP ktmp);
7664   ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
7667     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register);
7668     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7669   %}
7670   ins_pipe( pipe_slow );
7671 %}
7672 
7673 
7674 instruct evblendvp64_masked(vec dst, vec src1, vec src2, kReg mask, rRegP scratch) %{
7675   predicate(n->in(2)->bottom_type()->isa_vectmask() &&
7676             (!is_subword_type(Matcher::vector_element_basic_type(n)) ||
7677              VM_Version::supports_avx512bw()));
7678   match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch as TEMP" %}
7680   effect(TEMP scratch);
7681   ins_encode %{
7682     int vlen_enc = vector_length_encoding(this);
7683     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7684     __ evpblend(elem_bt, $dst$$XMMRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7685   %}
7686   ins_pipe( pipe_slow );
7687 %}
7688 
7689 // --------------------------------- ABS --------------------------------------
7690 // a = |a|
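// Byte, short and int lanes use pabsb/pabsw/pabsd (or their VEX forms for wider
// vectors); 64-bit lanes have no pre-AVX-512 absolute value instruction, so AbsVL
// relies on the EVEX-only evpabsq, hence the UseAVX > 2 assert below.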
7691 instruct vabsB_reg(vec dst, vec src) %{
7692   match(Set dst (AbsVB  src));
7693   ins_cost(450);
7694   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
7695   ins_encode %{
7696     uint vlen = Matcher::vector_length(this);
7697     if (vlen <= 16) {
7698       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7699     } else {
7700       int vlen_enc = vector_length_encoding(this);
7701       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7702     }
7703   %}
7704   ins_pipe( pipe_slow );
7705 %}
7706 
7707 instruct vabsS_reg(vec dst, vec src) %{
7708   match(Set dst (AbsVS  src));
7709   ins_cost(450);
7710   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
7711   ins_encode %{
7712     uint vlen = Matcher::vector_length(this);
7713     if (vlen <= 8) {
7714       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7715     } else {
7716       int vlen_enc = vector_length_encoding(this);
7717       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7718     }
7719   %}
7720   ins_pipe( pipe_slow );
7721 %}
7722 
7723 instruct vabsI_reg(vec dst, vec src) %{
7724   match(Set dst (AbsVI  src));
7725   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
7726   ins_cost(250);
7727   ins_encode %{
7728     uint vlen = Matcher::vector_length(this);
7729     if (vlen <= 4) {
7730       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7731     } else {
7732       int vlen_enc = vector_length_encoding(this);
7733       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7734     }
7735   %}
7736   ins_pipe( pipe_slow );
7737 %}
7738 
7739 instruct vabsL_reg(vec dst, vec src) %{
7740   match(Set dst (AbsVL  src));
7741   ins_cost(450);
7742   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
7743   ins_encode %{
7744     assert(UseAVX > 2, "required");
7745     int vlen_enc = vector_length_encoding(this);
7746     if (!VM_Version::supports_avx512vl()) {
7747       vlen_enc = Assembler::AVX_512bit;
7748     }
7749     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7750   %}
7751   ins_pipe( pipe_slow );
7752 %}
7753 
7754 // --------------------------------- ABSNEG --------------------------------------
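// AbsVF/NegVF and AbsVD/NegVD share one rule per type: vabsnegf/vabsnegd combine every
// lane with a sign-bit mask constant addressed through $scratch, clearing the sign bit
// for Abs and flipping it for Neg (the details live in the macro assembler).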
7755 
7756 instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
7757   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
7758   match(Set dst (AbsVF src));
7759   match(Set dst (NegVF src));
7760   effect(TEMP scratch);
7761   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
7762   ins_cost(150);
7763   ins_encode %{
7764     int opcode = this->ideal_Opcode();
7765     int vlen = Matcher::vector_length(this);
7766     if (vlen == 2) {
7767       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7768     } else {
7769       assert(vlen == 8 || vlen == 16, "required");
7770       int vlen_enc = vector_length_encoding(this);
7771       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7772     }
7773   %}
7774   ins_pipe( pipe_slow );
7775 %}
7776 
7777 instruct vabsneg4F(vec dst, rRegI scratch) %{
7778   predicate(Matcher::vector_length(n) == 4);
7779   match(Set dst (AbsVF dst));
7780   match(Set dst (NegVF dst));
7781   effect(TEMP scratch);
7782   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
7783   ins_cost(150);
7784   ins_encode %{
7785     int opcode = this->ideal_Opcode();
7786     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
7787   %}
7788   ins_pipe( pipe_slow );
7789 %}
7790 
7791 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
7792   match(Set dst (AbsVD  src));
7793   match(Set dst (NegVD  src));
7794   effect(TEMP scratch);
7795   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
7796   ins_encode %{
7797     int opcode = this->ideal_Opcode();
7798     uint vlen = Matcher::vector_length(this);
7799     if (vlen == 2) {
7800       assert(UseSSE >= 2, "required");
7801       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7802     } else {
7803       int vlen_enc = vector_length_encoding(this);
7804       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7805     }
7806   %}
7807   ins_pipe( pipe_slow );
7808 %}
7809 
7810 //------------------------------------- VectorTest --------------------------------------------
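// VectorTest reduces a vector (or mask register) comparison to a scalar boolean.
// The alltrue rules are selected on BoolTest::overflow and read the carry flag, the
// anytrue rules on BoolTest::ne and read the zero flag; the cmpvptest_* variants fuse
// the test with a following CmpI against zero and produce only the flags.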
7811 
7812 #ifdef _LP64
7813 instruct vptest_alltrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp1, legVec vtmp2, rFlagsReg cr) %{
7814   predicate(!VM_Version::supports_avx512bwdq() &&
7815             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7816             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7817             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7818   match(Set dst (VectorTest src1 src2 ));
7819   effect(TEMP vtmp1, TEMP vtmp2, KILL cr);
7820   format %{ "vptest_alltrue_lt16 $dst,$src1, $src2\t! using $vtmp1, $vtmp2 and $cr as TEMP" %}
7821   ins_encode %{
7822     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7823     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
7824     __ setb(Assembler::carrySet, $dst$$Register);
7825     __ movzbl($dst$$Register, $dst$$Register);
7826   %}
7827   ins_pipe( pipe_slow );
7828 %}
7829 
7830 instruct vptest_alltrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7831   predicate(!VM_Version::supports_avx512bwdq() &&
7832             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7833             Matcher::vector_length_in_bytes(n->in(1)) <  64 &&
7834             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7835   match(Set dst (VectorTest src1 src2 ));
7836   effect(KILL cr);
7837   format %{ "vptest_alltrue_ge16  $dst,$src1, $src2\t! using $cr as TEMP" %}
7838   ins_encode %{
7839     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7840     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7841     __ setb(Assembler::carrySet, $dst$$Register);
7842     __ movzbl($dst$$Register, $dst$$Register);
7843   %}
7844   ins_pipe( pipe_slow );
7845 %}
7846 
7847 instruct vptest_alltrue_lt8_evex(rRegI dst, kReg src1, kReg src2, kReg kscratch, rFlagsReg cr) %{
7848   predicate(VM_Version::supports_avx512bwdq() &&
7849             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
7850             n->in(1)->bottom_type()->isa_vectmask() &&
7851             Matcher::vector_length(n->in(1)) < 8);
7852   match(Set dst (VectorTest src1 src2));
7853   effect(KILL cr, TEMP kscratch);
  format %{ "vptest_alltrue_lt8_evex $dst,$src1,$src2\t! using $cr and $kscratch as TEMP" %}
7855   ins_encode %{
7856     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7857     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7858     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7859     uint masklen = Matcher::vector_length(this, $src1);
7860     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, $kscratch$$KRegister);
7861   %}
7862   ins_pipe( pipe_slow );
7863 %}
7864 
7865 
7866 instruct vptest_alltrue_ge8_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
7867   predicate(VM_Version::supports_avx512bwdq() &&
7868             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow &&
7869             n->in(1)->bottom_type()->isa_vectmask() &&
7870             Matcher::vector_length(n->in(1)) >= 8);
7871   match(Set dst (VectorTest src1 src2));
7872   effect(KILL cr);
7873   format %{ "vptest_alltrue_ge8_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
7874   ins_encode %{
7875     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7876     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7877     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7878     uint masklen = Matcher::vector_length(this, $src1);
7879     __ alltrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister, knoreg);
7880   %}
7881   ins_pipe( pipe_slow );
7882 %}
7883 
7884 
7885 instruct vptest_anytrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp, rFlagsReg cr) %{
7886   predicate(!VM_Version::supports_avx512bwdq() &&
7887             Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7888             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7889             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7890   match(Set dst (VectorTest src1 src2 ));
7891   effect(TEMP vtmp, KILL cr);
7892   format %{ "vptest_anytrue_lt16 $dst,$src1,$src2\t! using $vtmp, $cr as TEMP" %}
7893   ins_encode %{
7894     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7895     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7896     __ setb(Assembler::notZero, $dst$$Register);
7897     __ movzbl($dst$$Register, $dst$$Register);
7898   %}
7899   ins_pipe( pipe_slow );
7900 %}
7901 
7902 instruct vptest_anytrue_ge16(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7903   predicate(!VM_Version::supports_avx512bwdq() &&
7904             Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7905             Matcher::vector_length_in_bytes(n->in(1)) < 64  &&
7906             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7907   match(Set dst (VectorTest src1 src2 ));
7908   effect(KILL cr);
7909   format %{ "vptest_anytrue_ge16 $dst,$src1,$src2\t! using $cr as TEMP" %}
7910   ins_encode %{
7911     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7912     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7913     __ setb(Assembler::notZero, $dst$$Register);
7914     __ movzbl($dst$$Register, $dst$$Register);
7915   %}
7916   ins_pipe( pipe_slow );
7917 %}
7918 
7919 instruct vptest_anytrue_evex(rRegI dst, kReg src1, kReg src2, rFlagsReg cr) %{
7920   predicate(VM_Version::supports_avx512bwdq() &&
7921             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7922   match(Set dst (VectorTest src1 src2));
7923   effect(KILL cr);
  format %{ "vptest_anytrue_evex $dst,$src1,$src2\t! using $cr as TEMP" %}
7925   ins_encode %{
7926     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7927     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7928     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7929     uint  masklen = Matcher::vector_length(this, $src1);
7930     __ anytrue($dst$$Register, masklen, $src1$$KRegister, $src2$$KRegister);
7931   %}
7932   ins_pipe( pipe_slow );
7933 %}
7934 
7935 instruct cmpvptest_anytrue_lt16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, legVec vtmp) %{
7936   predicate(!VM_Version::supports_avx512bwdq() &&
7937             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 4 &&
7938             Matcher::vector_length_in_bytes(n->in(1)->in(1)) < 16 &&
7939             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7940   match(Set cr (CmpI (VectorTest src1 src2) zero));
7941   effect(TEMP vtmp);
7942   format %{ "cmpvptest_anytrue_lt16 $src1,$src2\t! using $vtmp as TEMP" %}
7943   ins_encode %{
7944     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7945     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7946   %}
7947   ins_pipe( pipe_slow );
7948 %}
7949 
7950 instruct cmpvptest_anytrue_ge16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero) %{
7951   predicate(!VM_Version::supports_avx512bwdq() &&
7952             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 16 &&
7953             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <  64 &&
7954             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7955   match(Set cr (CmpI (VectorTest src1 src2) zero));
7956   format %{ "cmpvptest_anytrue_ge16 $src1,$src2\t!" %}
7957   ins_encode %{
7958     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7959     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7960   %}
7961   ins_pipe( pipe_slow );
7962 %}
7963 
7964 instruct cmpvptest_anytrue_evex(rFlagsReg cr, kReg src1, kReg src2, immI_0 zero) %{
7965   predicate(VM_Version::supports_avx512bwdq() &&
7966             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7967   match(Set cr (CmpI (VectorTest src1 src2) zero));
7968   format %{ "cmpvptest_anytrue_evex $src1,$src2\t!" %}
7969   ins_encode %{
7970     uint masklen = Matcher::vector_length(this, $src1);
7971     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
7972     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
7973     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
7974     masklen = masklen < 8 ? 8 : masklen;
7975     __ ktest(masklen, $src1$$KRegister, $src2$$KRegister);
7976   %}
7977   ins_pipe( pipe_slow );
7978 %}
7979 #endif
7980 
7981 //------------------------------------- LoadMask --------------------------------------------
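// VectorLoadMask widens a boolean (byte 0/1) vector into a mask: either a vector
// register whose true lanes become all-ones, or, when the node is typed as a vectmask,
// an AVX-512 k register with one bit per lane.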
7982 
7983 instruct loadMask(legVec dst, legVec src) %{
7984   predicate(n->bottom_type()->isa_vectmask() == NULL && !VM_Version::supports_avx512vlbw());
7985   match(Set dst (VectorLoadMask src));
7986   effect(TEMP dst);
7987   format %{ "vector_loadmask_byte $dst, $src\n\t" %}
7988   ins_encode %{
7989     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7990     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7991     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
7992   %}
7993   ins_pipe( pipe_slow );
7994 %}
7995 
7996 instruct loadMask64(kReg dst, vec src, vec xtmp, rRegI tmp) %{
7997   predicate(n->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
7998   match(Set dst (VectorLoadMask src));
7999   effect(TEMP xtmp, TEMP tmp);
8000   format %{ "vector_loadmask_64byte $dst, $src\t! using $xtmp and $tmp as TEMP" %}
8001   ins_encode %{
8002     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
8003                         $tmp$$Register, true, Assembler::AVX_512bit);
8004   %}
8005   ins_pipe( pipe_slow );
8006 %}
8007 
8008 instruct loadMask_evex(kReg dst, vec src,  vec xtmp) %{
8009   predicate(n->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
8010   match(Set dst (VectorLoadMask src));
8011   effect(TEMP xtmp);
8012   format %{ "vector_loadmask_byte $dst, $src\t! using $xtmp as TEMP" %}
8013   ins_encode %{
8014     int vlen_enc = vector_length_encoding(in(1));
8015     __ load_vector_mask($dst$$KRegister, $src$$XMMRegister, $xtmp$$XMMRegister,
8016                         noreg, false, vlen_enc);
8017   %}
8018   ins_pipe( pipe_slow );
8019 %}
8020 
8021 //------------------------------------- StoreMask --------------------------------------------
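// VectorStoreMask is the reverse conversion: it narrows a mask (all-ones lanes of $size
// bytes, or a k register) back to one byte per lane holding 1 for true and 0 for false;
// the pabsb/vpabsb steps in the encodings map the -1 lanes to 1.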
8022 
8023 instruct vstoreMask1B(vec dst, vec src, immI_1 size) %{
8024   predicate(Matcher::vector_length(n) < 64 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8025   match(Set dst (VectorStoreMask src size));
8026   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8027   ins_encode %{
8028     int vlen = Matcher::vector_length(this);
8029     if (vlen <= 16 && UseAVX <= 2) {
8030       assert(UseSSE >= 3, "required");
8031       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
8032     } else {
8033       assert(UseAVX > 0, "required");
8034       int src_vlen_enc = vector_length_encoding(this, $src);
8035       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8036     }
8037   %}
8038   ins_pipe( pipe_slow );
8039 %}
8040 
8041 instruct vstoreMask2B(vec dst, vec src, vec xtmp, immI_2 size) %{
8042   predicate(Matcher::vector_length(n) <= 16 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8043   match(Set dst (VectorStoreMask src size));
8044   effect(TEMP_DEF dst, TEMP xtmp);
8045   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8046   ins_encode %{
8047     int vlen_enc = Assembler::AVX_128bit;
8048     int vlen = Matcher::vector_length(this);
8049     if (vlen <= 8) {
8050       assert(UseSSE >= 3, "required");
8051       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
8052       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
8053       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
8054     } else {
8055       assert(UseAVX > 0, "required");
8056       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
8057       __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8058       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8059     }
8060   %}
8061   ins_pipe( pipe_slow );
8062 %}
8063 
8064 instruct vstoreMask4B(vec dst, vec src, vec xtmp, immI_4 size) %{
8065   predicate(UseAVX <= 2 && Matcher::vector_length(n) <= 8 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8066   match(Set dst (VectorStoreMask src size));
8067   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8068   effect(TEMP_DEF dst, TEMP xtmp);
8069   ins_encode %{
8070     int vlen_enc = Assembler::AVX_128bit;
8071     int vlen = Matcher::vector_length(this);
8072     if (vlen <= 4) {
8073       assert(UseSSE >= 3, "required");
8074       __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
8075       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
8076       __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
8077       __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
8078     } else {
8079       assert(UseAVX > 0, "required");
8080       __ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
8081       __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
8082       __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8083       __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
8084       __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8085     }
8086   %}
8087   ins_pipe( pipe_slow );
8088 %}
8089 
8090 instruct storeMask8B(vec dst, vec src, vec xtmp, immI_8 size) %{
8091   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 2);
8092   match(Set dst (VectorStoreMask src size));
8093   effect(TEMP_DEF dst, TEMP xtmp);
8094   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8095   ins_encode %{
8096     assert(UseSSE >= 3, "required");
8097     __ pxor($xtmp$$XMMRegister, $xtmp$$XMMRegister);
8098     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
8099     __ pabsd($dst$$XMMRegister, $dst$$XMMRegister);
8100     __ packusdw($dst$$XMMRegister, $xtmp$$XMMRegister);
8101     __ packuswb($dst$$XMMRegister, $xtmp$$XMMRegister);
8102   %}
8103   ins_pipe( pipe_slow );
8104 %}
8105 
8106 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, vec vtmp) %{
8107   predicate(UseAVX <= 2 && Matcher::vector_length(n) == 4);
8108   match(Set dst (VectorStoreMask src size));
8109   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s], using $vtmp as TEMP" %}
8110   effect(TEMP_DEF dst, TEMP vtmp);
8111   ins_encode %{
8112     int vlen_enc = Assembler::AVX_128bit;
8113     __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
8114     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
8115     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
8116     __ vpxor($vtmp$$XMMRegister, $vtmp$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8117     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8118     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8119     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8120   %}
8121   ins_pipe( pipe_slow );
8122 %}
8123 
8124 instruct vstoreMask4B_evex_novectmask(vec dst, vec src, immI_4 size) %{
8125   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8126   match(Set dst (VectorStoreMask src size));
8127   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8128   ins_encode %{
8129     int src_vlen_enc = vector_length_encoding(this, $src);
8130     int dst_vlen_enc = vector_length_encoding(this);
8131     if (!VM_Version::supports_avx512vl()) {
8132       src_vlen_enc = Assembler::AVX_512bit;
8133     }
8134     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8135     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8136   %}
8137   ins_pipe( pipe_slow );
8138 %}
8139 
8140 instruct vstoreMask8B_evex_novectmask(vec dst, vec src, immI_8 size) %{
8141   predicate(UseAVX > 2 && n->in(1)->bottom_type()->isa_vectmask() == NULL);
8142   match(Set dst (VectorStoreMask src size));
8143   format %{ "vector_store_mask $dst, $src \t! elem size is $size byte[s]" %}
8144   ins_encode %{
8145     int src_vlen_enc = vector_length_encoding(this, $src);
8146     int dst_vlen_enc = vector_length_encoding(this);
8147     if (!VM_Version::supports_avx512vl()) {
8148       src_vlen_enc = Assembler::AVX_512bit;
8149     }
8150     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
8151     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8152   %}
8153   ins_pipe( pipe_slow );
8154 %}
8155 
8156 instruct vstoreMask_evex_vectmask(vec dst, kReg mask, immI size, rRegI tmp) %{
8157   predicate(n->in(1)->bottom_type()->isa_vectmask() && !VM_Version::supports_avx512vlbw());
8158   match(Set dst (VectorStoreMask mask size));
8159   effect(TEMP_DEF dst, TEMP tmp);
8160   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
8161   ins_encode %{
8162     assert(Matcher::vector_length_in_bytes(this, $mask) == 64, "");
8163     __ evmovdqul($dst$$XMMRegister, $mask$$KRegister, ExternalAddress(vector_int_mask_cmp_bits()),
8164                  false, Assembler::AVX_512bit, $tmp$$Register);
8165     __ evpmovdb($dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_512bit);
8166   %}
8167   ins_pipe( pipe_slow );
8168 %}
8169 
8170 instruct vstoreMask_evex(vec dst, kReg mask, immI size) %{
8171   predicate(n->in(1)->bottom_type()->isa_vectmask() && VM_Version::supports_avx512vlbw());
8172   match(Set dst (VectorStoreMask mask size));
8173   effect(TEMP_DEF dst);
8174   format %{ "vector_store_mask $dst, $mask \t! elem size is $size byte[s]" %}
8175   ins_encode %{
8176     int dst_vlen_enc = vector_length_encoding(this);
8177     __ evpmovm2b($dst$$XMMRegister, $mask$$KRegister, dst_vlen_enc);
8178     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
8179   %}
8180   ins_pipe( pipe_slow );
8181 %}
8182 
8183 instruct vmaskcast_evex(kReg dst) %{
8184   predicate(Matcher::vector_length(n) == Matcher::vector_length(n->in(1)));
8185   match(Set dst (VectorMaskCast dst));
8186   ins_cost(0);
8187   format %{ "vector_mask_cast $dst" %}
8188   ins_encode %{
8189     // empty
8190   %}
8191   ins_pipe(empty);
8192 %}
8193 
8194 instruct vmaskcast(vec dst) %{
8195   predicate((Matcher::vector_length(n) == Matcher::vector_length(n->in(1))) &&
8196             (Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))));
8197   match(Set dst (VectorMaskCast dst));
8198   ins_cost(0);
8199   format %{ "vector_mask_cast $dst" %}
8200   ins_encode %{
8201     // empty
8202   %}
8203   ins_pipe(empty);
8204 %}
8205 
8206 //-------------------------------- Load Iota Indices ----------------------------------
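// VectorLoadConst with a zero source fills the byte lanes of $dst with the iota
// sequence 0, 1, 2, ... taken from a constant table addressed through $scratch.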
8207 
8208 instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
8209   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
8210   match(Set dst (VectorLoadConst src));
8211   effect(TEMP scratch);
8212   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
8213   ins_encode %{
    int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
    __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen_in_bytes);
8216   %}
8217   ins_pipe( pipe_slow );
8218 %}
8219 
8220 //-------------------------------- Rearrange ----------------------------------
8221 
8222 // LoadShuffle/Rearrange for Byte
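// VectorLoadShuffle converts a vector of lane indices into whatever index format the
// matching VectorRearrange rule needs, and VectorRearrange then permutes the source:
//   dst[i] = src[shuffle[i]]
// Byte indices can be used by pshufb/vpermb as-is, so loadShuffleB is a no-op.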
8223 
8224 instruct loadShuffleB(vec dst) %{
8225   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
8226   match(Set dst (VectorLoadShuffle dst));
8227   format %{ "vector_load_shuffle $dst, $dst" %}
8228   ins_encode %{
8229     // empty
8230   %}
8231   ins_pipe( pipe_slow );
8232 %}
8233 
8234 instruct rearrangeB(vec dst, vec shuffle) %{
8235   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8236             Matcher::vector_length(n) < 32);
8237   match(Set dst (VectorRearrange dst shuffle));
8238   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8239   ins_encode %{
8240     assert(UseSSE >= 4, "required");
8241     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8242   %}
8243   ins_pipe( pipe_slow );
8244 %}
8245 
8246 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
8247   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8248             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
8249   match(Set dst (VectorRearrange src shuffle));
8250   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
8251   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
8252   ins_encode %{
8253     assert(UseAVX >= 2, "required");
8254     // Swap src into vtmp1
8255     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle swapped src to get entries from the other 128-bit lane
8257     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle original src to get entries from its own 128-bit lane
8259     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting high bits for entries coming from the other lane in shuffle
8261     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
8262     // Perform the blend
8263     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
8264   %}
8265   ins_pipe( pipe_slow );
8266 %}
8267 
8268 instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
8269   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
8270             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
8271   match(Set dst (VectorRearrange src shuffle));
8272   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8273   ins_encode %{
8274     int vlen_enc = vector_length_encoding(this);
8275     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8276   %}
8277   ins_pipe( pipe_slow );
8278 %}
8279 
8280 // LoadShuffle/Rearrange for Short
8281 
8282 instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
8283   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8284             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
8285   match(Set dst (VectorLoadShuffle src));
8286   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8287   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8288   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask;
    // only a byte shuffle instruction (pshufb) is available on these platforms
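    // e.g. a short-lane index of 3 becomes the byte-index pair {6, 7}:
    //      2*3 = 6 selects the low byte and 2*3 + 1 = 7 the high byte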
8291     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8292     if (UseAVX == 0) {
8293       assert(vlen_in_bytes <= 16, "required");
8294       // Multiply each shuffle by two to get byte index
8295       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
8296       __ psllw($vtmp$$XMMRegister, 1);
8297 
8298       // Duplicate to create 2 copies of byte index
8299       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
8300       __ psllw($dst$$XMMRegister, 8);
8301       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
8302 
8303       // Add one to get alternate byte index
8304       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
8305       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
8306     } else {
8307       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
8308       int vlen_enc = vector_length_encoding(this);
8309       // Multiply each shuffle by two to get byte index
8310       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
8311       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
8312 
8313       // Duplicate to create 2 copies of byte index
8314       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
8315       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8316 
8317       // Add one to get alternate byte index
8318       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, $scratch$$Register);
8319     }
8320   %}
8321   ins_pipe( pipe_slow );
8322 %}
8323 
8324 instruct rearrangeS(vec dst, vec shuffle) %{
8325   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8326             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
8327   match(Set dst (VectorRearrange dst shuffle));
8328   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8329   ins_encode %{
8330     assert(UseSSE >= 4, "required");
8331     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8332   %}
8333   ins_pipe( pipe_slow );
8334 %}
8335 
8336 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
8337   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8338             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
8339   match(Set dst (VectorRearrange src shuffle));
8340   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
8341   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
8342   ins_encode %{
8343     assert(UseAVX >= 2, "required");
8344     // Swap src into vtmp1
8345     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle swapped src to get entries from the other 128-bit lane
8347     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle original src to get entries from its own 128-bit lane
8349     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting high bits for entries coming from the other lane in shuffle
8351     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
8352     // Perform the blend
8353     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
8354   %}
8355   ins_pipe( pipe_slow );
8356 %}
8357 
8358 instruct loadShuffleS_evex(vec dst, vec src) %{
8359   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8360             VM_Version::supports_avx512bw());
8361   match(Set dst (VectorLoadShuffle src));
8362   format %{ "vector_load_shuffle $dst, $src" %}
8363   ins_encode %{
8364     int vlen_enc = vector_length_encoding(this);
8365     if (!VM_Version::supports_avx512vl()) {
8366       vlen_enc = Assembler::AVX_512bit;
8367     }
8368     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8369   %}
8370   ins_pipe( pipe_slow );
8371 %}
8372 
8373 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
8374   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
8375             VM_Version::supports_avx512bw());
8376   match(Set dst (VectorRearrange src shuffle));
8377   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8378   ins_encode %{
8379     int vlen_enc = vector_length_encoding(this);
8380     if (!VM_Version::supports_avx512vl()) {
8381       vlen_enc = Assembler::AVX_512bit;
8382     }
8383     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8384   %}
8385   ins_pipe( pipe_slow );
8386 %}
8387 
8388 // LoadShuffle/Rearrange for Integer and Float
8389 
8390 instruct loadShuffleI(vec dst, vec src, vec vtmp, rRegP scratch) %{
8391   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8392             Matcher::vector_length(n) == 4 && UseAVX < 2);
8393   match(Set dst (VectorLoadShuffle src));
8394   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8395   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8396   ins_encode %{
8397     assert(UseSSE >= 4, "required");
8398 
    // Create a byte shuffle mask from the int shuffle mask;
    // only a byte shuffle instruction (pshufb) is available on these platforms
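    // e.g. an int-lane index of 2 becomes the byte indices {8, 9, 10, 11}
    //      (4*2 plus the per-byte offsets 0..3 added below)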
8401 
8402     // Duplicate and multiply each shuffle by 4
8403     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
8404     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
8405     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
8406     __ psllw($vtmp$$XMMRegister, 2);
8407 
8408     // Duplicate again to create 4 copies of byte index
8409     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
8410     __ psllw($dst$$XMMRegister, 8);
8411     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
8412 
8413     // Add 3,2,1,0 to get alternate byte index
8414     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
8415     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
8416   %}
8417   ins_pipe( pipe_slow );
8418 %}
8419 
8420 instruct rearrangeI(vec dst, vec shuffle) %{
  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
            Matcher::vector_length(n) == 4 && UseAVX < 2);
8423   match(Set dst (VectorRearrange dst shuffle));
8424   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
8425   ins_encode %{
8426     assert(UseSSE >= 4, "required");
8427     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
8428   %}
8429   ins_pipe( pipe_slow );
8430 %}
8431 
8432 instruct loadShuffleI_avx(vec dst, vec src) %{
8433   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8434             UseAVX >= 2);
8435   match(Set dst (VectorLoadShuffle src));
8436   format %{ "vector_load_shuffle $dst, $src" %}
8437   ins_encode %{
    int vlen_enc = vector_length_encoding(this);
8439     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8440   %}
8441   ins_pipe( pipe_slow );
8442 %}
8443 
8444 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
8445   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
8446             UseAVX >= 2);
8447   match(Set dst (VectorRearrange src shuffle));
8448   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8449   ins_encode %{
8450     int vlen_enc = vector_length_encoding(this);
8451     if (vlen_enc == Assembler::AVX_128bit) {
8452       vlen_enc = Assembler::AVX_256bit;
8453     }
8454     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8455   %}
8456   ins_pipe( pipe_slow );
8457 %}
8458 
8459 // LoadShuffle/Rearrange for Long and Double
8460 
8461 instruct loadShuffleL(vec dst, vec src, vec vtmp, rRegP scratch) %{
8462   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8463             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
8464   match(Set dst (VectorLoadShuffle src));
8465   effect(TEMP dst, TEMP vtmp, TEMP scratch);
8466   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
8467   ins_encode %{
8468     assert(UseAVX >= 2, "required");
8469 
8470     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask;
    // only a double word shuffle instruction (vpermd) is available on these platforms
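    // e.g. a long-lane index of 1 becomes the double word indices {2, 3}
    //      (2*1 for the low half and 2*1 + 1 for the high half)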
8473 
8474     // Multiply each shuffle by two to get double word index
8475     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
8476     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
8477 
8478     // Duplicate each double word shuffle
8479     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
8480     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
8481 
8482     // Add one to get alternate double word index
8483     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, $scratch$$Register);
8484   %}
8485   ins_pipe( pipe_slow );
8486 %}
8487 
8488 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
8489   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8490             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
8491   match(Set dst (VectorRearrange src shuffle));
8492   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8493   ins_encode %{
8494     assert(UseAVX >= 2, "required");
8495 
8496     int vlen_enc = vector_length_encoding(this);
8497     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8498   %}
8499   ins_pipe( pipe_slow );
8500 %}
8501 
8502 instruct loadShuffleL_evex(vec dst, vec src) %{
8503   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8504             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
8505   match(Set dst (VectorLoadShuffle src));
8506   format %{ "vector_load_shuffle $dst, $src" %}
8507   ins_encode %{
8508     assert(UseAVX > 2, "required");
8509 
8510     int vlen_enc = vector_length_encoding(this);
8511     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8512   %}
8513   ins_pipe( pipe_slow );
8514 %}
8515 
8516 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
8517   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
8518             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
8519   match(Set dst (VectorRearrange src shuffle));
8520   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8521   ins_encode %{
8522     assert(UseAVX > 2, "required");
8523 
8524     int vlen_enc = vector_length_encoding(this);
8525     if (vlen_enc == Assembler::AVX_128bit) {
8526       vlen_enc = Assembler::AVX_256bit;
8527     }
8528     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8529   %}
8530   ins_pipe( pipe_slow );
8531 %}
8532 
8533 // --------------------------------- FMA --------------------------------------
8534 // a * b + c
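// FmaVF/FmaVD compute a * b + c as one fused operation with a single rounding; the
// rules are only selected when UseFMA is enabled (see the asserts in the encodings),
// and the _mem forms fold the load of the second multiplicand.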
8535 
8536 instruct vfmaF_reg(vec a, vec b, vec c) %{
8537   match(Set c (FmaVF  c (Binary a b)));
8538   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8539   ins_cost(150);
8540   ins_encode %{
8541     assert(UseFMA, "not enabled");
8542     int vlen_enc = vector_length_encoding(this);
8543     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8544   %}
8545   ins_pipe( pipe_slow );
8546 %}
8547 
8548 instruct vfmaF_mem(vec a, memory b, vec c) %{
8549   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8550   match(Set c (FmaVF  c (Binary a (LoadVector b))));
8551   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8552   ins_cost(150);
8553   ins_encode %{
8554     assert(UseFMA, "not enabled");
8555     int vlen_enc = vector_length_encoding(this);
8556     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8557   %}
8558   ins_pipe( pipe_slow );
8559 %}
8560 
8561 instruct vfmaD_reg(vec a, vec b, vec c) %{
8562   match(Set c (FmaVD  c (Binary a b)));
8563   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8564   ins_cost(150);
8565   ins_encode %{
8566     assert(UseFMA, "not enabled");
8567     int vlen_enc = vector_length_encoding(this);
8568     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8569   %}
8570   ins_pipe( pipe_slow );
8571 %}
8572 
8573 instruct vfmaD_mem(vec a, memory b, vec c) %{
8574   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8575   match(Set c (FmaVD  c (Binary a (LoadVector b))));
8576   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8577   ins_cost(150);
8578   ins_encode %{
8579     assert(UseFMA, "not enabled");
8580     int vlen_enc = vector_length_encoding(this);
8581     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8582   %}
8583   ins_pipe( pipe_slow );
8584 %}
8585 
8586 // --------------------------------- Vector Multiply Add --------------------------------------
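// MulAddVS2VI maps to pmaddwd/vpmaddwd: each adjacent pair of 16-bit elements is
// multiplied and the two products are summed into a single 32-bit lane, i.e.
//   dst[i] = src1[2i]*src2[2i] + src1[2i+1]*src2[2i+1]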
8587 
8588 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
8589   predicate(UseAVX == 0);
8590   match(Set dst (MulAddVS2VI dst src1));
8591   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
8592   ins_encode %{
8593     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
8594   %}
8595   ins_pipe( pipe_slow );
8596 %}
8597 
8598 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
8599   predicate(UseAVX > 0);
8600   match(Set dst (MulAddVS2VI src1 src2));
8601   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
8602   ins_encode %{
8603     int vlen_enc = vector_length_encoding(this);
8604     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8605   %}
8606   ins_pipe( pipe_slow );
8607 %}
8608 
8609 // --------------------------------- Vector Multiply Add Add ----------------------------------
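// With AVX512_VNNI the pmaddwd and the following vector add collapse into one
// instruction: evpdpwssd accumulates the 16-bit pair products directly into $dst.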
8610 
8611 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
8612   predicate(VM_Version::supports_avx512_vnni());
8613   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
8614   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
8615   ins_encode %{
8616     assert(UseAVX > 2, "required");
8617     int vlen_enc = vector_length_encoding(this);
8618     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8619   %}
8620   ins_pipe( pipe_slow );
8621   ins_cost(10);
8622 %}
8623 
8624 // --------------------------------- PopCount --------------------------------------
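// PopCountVI/PopCountVL count the set bits in every lane. With AVX512_VPOPCNTDQ this is
// a single vector instruction; otherwise vector_popcount_int/long emit a longer
// fallback sequence that needs the extra temporaries declared below.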
8625 
8626 instruct vpopcountI_popcntd(vec dst, vec src) %{
8627   predicate(VM_Version::supports_avx512_vpopcntdq());
8628   match(Set dst (PopCountVI src));
8629   format %{ "vector_popcount_int $dst, $src\t! vector popcount packedI" %}
8630   ins_encode %{
8631     assert(UsePopCountInstruction, "not enabled");
8632     int vlen_enc = vector_length_encoding(this);
8633     __ vector_popcount_int($dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg, noreg, vlen_enc);
8634   %}
8635   ins_pipe( pipe_slow );
8636 %}
8637 
8638 instruct vpopcountI(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp, rFlagsReg cc) %{
8639   predicate(!VM_Version::supports_avx512_vpopcntdq());
8640   match(Set dst (PopCountVI src));
8641   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, KILL cc);
8642   format %{ "vector_popcount_int  $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
8643   ins_encode %{
8644     assert(UsePopCountInstruction, "not enabled");
8645     int vlen_enc = vector_length_encoding(this);
8646     __ vector_popcount_int($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
8647                            $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
8648   %}
8649   ins_pipe( pipe_slow );
8650 %}
8651 
8652 instruct vpopcountL_popcntd(vec dst, vec src) %{
8653   predicate(VM_Version::supports_avx512_vpopcntdq());
8654   match(Set dst (PopCountVL src));
8655   format %{ "vector_popcount_long  $dst, $src\t! vector popcount packedL" %}
8656   ins_encode %{
8657     assert(UsePopCountInstruction, "not enabled");
8658     int vlen_enc = vector_length_encoding(this, $src);
8659     __ vector_popcount_long($dst$$XMMRegister, $src$$XMMRegister, xnoreg, xnoreg, xnoreg, noreg, vlen_enc);
8660   %}
8661   ins_pipe( pipe_slow );
8662 %}
8663 
8664 instruct vpopcountL(vec dst, vec src, vec xtmp1, vec xtmp2, vec xtmp3, rRegP rtmp, rFlagsReg cc) %{
8665   predicate(!VM_Version::supports_avx512_vpopcntdq());
8666   match(Set dst (PopCountVL src));
8667   effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, KILL cc);
8668   format %{ "vector_popcount_long  $dst, $src\t! using $xtmp1, $xtmp2, $xtmp3, and $rtmp as TEMP" %}
8669   ins_encode %{
8670     assert(UsePopCountInstruction, "not enabled");
8671     int vlen_enc = vector_length_encoding(this, $src);
8672     __ vector_popcount_long($dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister,
8673                            $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
8674   %}
8675   ins_pipe( pipe_slow );
8676 %}
8677 
8678 // --------------------------------- Bitwise Ternary Logic ----------------------------------
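// vpternlogd interprets the $func immediate as a 3-input truth table: at every bit
// position the corresponding bits of dst, src2 and src3 index the table to produce the
// result bit (e.g. func == 0x96 yields dst ^ src2 ^ src3).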
8679 
8680 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
8681   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
8682   effect(TEMP dst);
8683   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8684   ins_encode %{
8685     int vector_len = vector_length_encoding(this);
8686     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
8687   %}
8688   ins_pipe( pipe_slow );
8689 %}
8690 
8691 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
8692   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
8693   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
8694   effect(TEMP dst);
8695   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8696   ins_encode %{
8697     int vector_len = vector_length_encoding(this);
8698     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
8699   %}
8700   ins_pipe( pipe_slow );
8701 %}
8702 
8703 // --------------------------------- Rotation Operations ----------------------------------
8704 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
8705   match(Set dst (RotateLeftV src shift));
8706   match(Set dst (RotateRightV src shift));
8707   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
8708   ins_encode %{
8709     int opcode      = this->ideal_Opcode();
8710     int vector_len  = vector_length_encoding(this);
8711     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8712     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
8713   %}
8714   ins_pipe( pipe_slow );
8715 %}
8716 
8717 instruct vprorate(vec dst, vec src, vec shift) %{
8718   match(Set dst (RotateLeftV src shift));
8719   match(Set dst (RotateRightV src shift));
8720   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
8721   ins_encode %{
8722     int opcode      = this->ideal_Opcode();
8723     int vector_len  = vector_length_encoding(this);
8724     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8725     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8726   %}
8727   ins_pipe( pipe_slow );
8728 %}
8729 
8730 #ifdef _LP64
8731 // ---------------------------------- Masked Operations ------------------------------------
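// VectorCmpMasked compares two vectors under a k mask, producing -1 when all selected
// lanes are equal and otherwise the position of the first mismatch; VectorMaskGen
// builds a k register with the low $len bits set, i.e. (1 << len) - 1 (see
// vmask_gen_imm below).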
8732 
8733 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
8734   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
8735   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
8736   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
8737   ins_encode %{
8738     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
8739     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
8740 
8741     Label DONE;
8742     int vlen_enc = vector_length_encoding(this, $src1);
8743     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
8744 
8745     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
8746     __ mov64($dst$$Register, -1L);
8747     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
8748     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
8749     __ jccb(Assembler::carrySet, DONE);
8750     __ kmovql($dst$$Register, $ktmp1$$KRegister);
8751     __ notq($dst$$Register);
8752     __ tzcntq($dst$$Register, $dst$$Register);
8753     __ bind(DONE);
8754   %}
8755   ins_pipe( pipe_slow );
8756 %}
8757 
8758 
8759 instruct vmasked_load64(vec dst, memory mem, kReg mask) %{
8760   match(Set dst (LoadVectorMasked mem mask));
8761   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
8762   ins_encode %{
8763     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
8764     int vector_len = vector_length_encoding(this);
8765     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
8766   %}
8767   ins_pipe( pipe_slow );
8768 %}
8769 
8770 instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
8771   match(Set dst (VectorMaskGen len));
8772   effect(TEMP temp);
8773   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
8774   ins_encode %{
8775     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
8776   %}
8777   ins_pipe( pipe_slow );
8778 %}
8779 
8780 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
8781   match(Set dst (VectorMaskGen len));
8782   format %{ "vector_mask_gen $len \t! vector mask generator" %}
8783   effect(TEMP temp);
8784   ins_encode %{
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
8786     __ kmovql($dst$$KRegister, $temp$$Register);
8787   %}
8788   ins_pipe( pipe_slow );
8789 %}
8790 
8791 instruct vmasked_store64(memory mem, vec src, kReg mask) %{
8792   match(Set mem (StoreVectorMasked mem (Binary src mask)));
8793   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
8794   ins_encode %{
8795     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
8796     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
8797     int vector_len = vector_length_encoding(src_node);
8798     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, vector_len);
8799   %}
8800   ins_pipe( pipe_slow );
8801 %}
8802 
8803 instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
8804   predicate(n->in(1)->bottom_type()->isa_vectmask());
8805   match(Set dst (VectorMaskToLong mask));
8806   effect(TEMP dst, KILL cr);
8807   format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
8808   ins_encode %{
8809     int opcode = this->ideal_Opcode();
8810     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8811     int mask_len = Matcher::vector_length(this, $mask);
8812     int mask_size = mask_len * type2aelembytes(mbt);
8813     int vlen_enc = vector_length_encoding(this, $mask);
8814     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
8815                              $dst$$Register, mask_len, mask_size, vlen_enc);
8816   %}
8817   ins_pipe( pipe_slow );
8818 %}
8819 
8820 instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
8821   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8822   match(Set dst (VectorMaskToLong mask));
8823   format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
8824   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
8825   ins_encode %{
8826     int opcode = this->ideal_Opcode();
8827     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8828     int mask_len = Matcher::vector_length(this, $mask);
8829     int vlen_enc = vector_length_encoding(this, $mask);
8830     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8831                              $dst$$Register, mask_len, mbt, vlen_enc);
8832   %}
8833   ins_pipe( pipe_slow );
8834 %}
8835 
8836 instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
8837   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
8838   match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
8839   format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
8840   effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
8841   ins_encode %{
8842     int opcode = this->ideal_Opcode();
8843     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8844     int mask_len = Matcher::vector_length(this, $mask);
8845     int vlen_enc = vector_length_encoding(this, $mask);
8846     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8847                              $dst$$Register, mask_len, mbt, vlen_enc);
8848   %}
8849   ins_pipe( pipe_slow );
8850 %}
8851 
8852 instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
8853   predicate(n->in(1)->bottom_type()->isa_vectmask());
8854   match(Set dst (VectorMaskTrueCount mask));
8855   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
8856   format %{ "vector_truecount_evex $dst, $mask \t! using $tmp as TEMP" %}
8857   ins_encode %{
8858     int opcode = this->ideal_Opcode();
8859     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8860     int mask_len = Matcher::vector_length(this, $mask);
8861     int mask_size = mask_len * type2aelembytes(mbt);
8862     int vlen_enc = vector_length_encoding(this, $mask);
8863     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
8864                              $tmp$$Register, mask_len, mask_size, vlen_enc);
8865   %}
8866   ins_pipe( pipe_slow );
8867 %}
8868 
8869 instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8870   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8871   match(Set dst (VectorMaskTrueCount mask));
8872   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8873   format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8874   ins_encode %{
8875     int opcode = this->ideal_Opcode();
8876     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8877     int mask_len = Matcher::vector_length(this, $mask);
8878     int vlen_enc = vector_length_encoding(this, $mask);
8879     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8880                              $tmp$$Register, mask_len, mbt, vlen_enc);
8881   %}
8882   ins_pipe( pipe_slow );
8883 %}
8884 
8885 instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8886   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
8887   match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
8888   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8889   format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8890   ins_encode %{
8891     int opcode = this->ideal_Opcode();
8892     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8893     int mask_len = Matcher::vector_length(this, $mask);
8894     int vlen_enc = vector_length_encoding(this, $mask);
8895     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8896                              $tmp$$Register, mask_len, mbt, vlen_enc);
8897   %}
8898   ins_pipe( pipe_slow );
8899 %}
8900 
8901 instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
8902   predicate(n->in(1)->bottom_type()->isa_vectmask());
8903   match(Set dst (VectorMaskFirstTrue mask));
8904   match(Set dst (VectorMaskLastTrue mask));
8905   effect(TEMP_DEF dst, TEMP tmp, KILL cr);
8906   format %{ "vector_mask_first_or_last_true_evex $dst, $mask \t! using $tmp as TEMP" %}
8907   ins_encode %{
8908     int opcode = this->ideal_Opcode();
8909     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8910     int mask_len = Matcher::vector_length(this, $mask);
8911     int mask_size = mask_len * type2aelembytes(mbt);
8912     int vlen_enc = vector_length_encoding(this, $mask);
8913     __ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
8914                              $tmp$$Register, mask_len, mask_size, vlen_enc);
8915   %}
8916   ins_pipe( pipe_slow );
8917 %}
8918 
8919 instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8920   predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
8921   match(Set dst (VectorMaskFirstTrue mask));
8922   match(Set dst (VectorMaskLastTrue mask));
8923   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8924   format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8925   ins_encode %{
8926     int opcode = this->ideal_Opcode();
8927     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8928     int mask_len = Matcher::vector_length(this, $mask);
8929     int vlen_enc = vector_length_encoding(this, $mask);
8930     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8931                              $tmp$$Register, mask_len, mbt, vlen_enc);
8932   %}
8933   ins_pipe( pipe_slow );
8934 %}
8935 
8936 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
8937   predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
8938   match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
8939   match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
8940   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
8941   format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
8942   ins_encode %{
8943     int opcode = this->ideal_Opcode();
8944     BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
8945     int mask_len = Matcher::vector_length(this, $mask);
8946     int vlen_enc = vector_length_encoding(this, $mask);
8947     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8948                              $tmp$$Register, mask_len, mbt, vlen_enc);
8949   %}
8950   ins_pipe( pipe_slow );
8951 %}
8952 #endif // _LP64
8953 
8954 // ---------------------------------- Vector Masked Operations ------------------------------------
8955 
8956 instruct vadd_reg_masked(vec dst, vec src2, kReg mask) %{
8957   match(Set dst (AddVB (Binary dst src2) mask));
8958   match(Set dst (AddVS (Binary dst src2) mask));
8959   match(Set dst (AddVI (Binary dst src2) mask));
8960   match(Set dst (AddVL (Binary dst src2) mask));
8961   match(Set dst (AddVF (Binary dst src2) mask));
8962   match(Set dst (AddVD (Binary dst src2) mask));
8963   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
8964   ins_encode %{
8965     int vlen_enc = vector_length_encoding(this);
8966     BasicType bt = Matcher::vector_element_basic_type(this);
8967     int opc = this->ideal_Opcode();
8968     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8969                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
8970   %}
8971   ins_pipe( pipe_slow );
8972 %}
8973 
8974 instruct vadd_mem_masked(vec dst, memory src2, kReg mask) %{
8975   match(Set dst (AddVB (Binary dst (LoadVector src2)) mask));
8976   match(Set dst (AddVS (Binary dst (LoadVector src2)) mask));
8977   match(Set dst (AddVI (Binary dst (LoadVector src2)) mask));
8978   match(Set dst (AddVL (Binary dst (LoadVector src2)) mask));
8979   match(Set dst (AddVF (Binary dst (LoadVector src2)) mask));
8980   match(Set dst (AddVD (Binary dst (LoadVector src2)) mask));
8981   format %{ "vpadd_masked $dst, $dst, $src2, $mask\t! add masked operation" %}
8982   ins_encode %{
8983     int vlen_enc = vector_length_encoding(this);
8984     BasicType bt = Matcher::vector_element_basic_type(this);
8985     int opc = this->ideal_Opcode();
8986     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
8987                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
8988   %}
8989   ins_pipe( pipe_slow );
8990 %}
8991 
8992 instruct vxor_reg_masked(vec dst, vec src2, kReg mask) %{
8993   match(Set dst (XorV (Binary dst src2) mask));
8994   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
8995   ins_encode %{
8996     int vlen_enc = vector_length_encoding(this);
8997     BasicType bt = Matcher::vector_element_basic_type(this);
8998     int opc = this->ideal_Opcode();
8999     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9000                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9001   %}
9002   ins_pipe( pipe_slow );
9003 %}
9004 
9005 instruct vxor_mem_masked(vec dst, memory src2, kReg mask) %{
9006   match(Set dst (XorV (Binary dst (LoadVector src2)) mask));
9007   format %{ "vxor_masked $dst, $dst, $src2, $mask\t! xor masked operation" %}
9008   ins_encode %{
9009     int vlen_enc = vector_length_encoding(this);
9010     BasicType bt = Matcher::vector_element_basic_type(this);
9011     int opc = this->ideal_Opcode();
9012     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9013                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9014   %}
9015   ins_pipe( pipe_slow );
9016 %}
9017 
9018 instruct vor_reg_masked(vec dst, vec src2, kReg mask) %{
9019   match(Set dst (OrV (Binary dst src2) mask));
9020   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
9021   ins_encode %{
9022     int vlen_enc = vector_length_encoding(this);
9023     BasicType bt = Matcher::vector_element_basic_type(this);
9024     int opc = this->ideal_Opcode();
9025     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9026                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9027   %}
9028   ins_pipe( pipe_slow );
9029 %}
9030 
9031 instruct vor_mem_masked(vec dst, memory src2, kReg mask) %{
9032   match(Set dst (OrV (Binary dst (LoadVector src2)) mask));
9033   format %{ "vor_masked $dst, $dst, $src2, $mask\t! or masked operation" %}
9034   ins_encode %{
9035     int vlen_enc = vector_length_encoding(this);
9036     BasicType bt = Matcher::vector_element_basic_type(this);
9037     int opc = this->ideal_Opcode();
9038     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9039                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9040   %}
9041   ins_pipe( pipe_slow );
9042 %}
9043 
9044 instruct vand_reg_masked(vec dst, vec src2, kReg mask) %{
9045   match(Set dst (AndV (Binary dst src2) mask));
9046   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
9047   ins_encode %{
9048     int vlen_enc = vector_length_encoding(this);
9049     BasicType bt = Matcher::vector_element_basic_type(this);
9050     int opc = this->ideal_Opcode();
9051     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9052                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9053   %}
9054   ins_pipe( pipe_slow );
9055 %}
9056 
9057 instruct vand_mem_masked(vec dst, memory src2, kReg mask) %{
9058   match(Set dst (AndV (Binary dst (LoadVector src2)) mask));
9059   format %{ "vand_masked $dst, $dst, $src2, $mask\t! and masked operation" %}
9060   ins_encode %{
9061     int vlen_enc = vector_length_encoding(this);
9062     BasicType bt = Matcher::vector_element_basic_type(this);
9063     int opc = this->ideal_Opcode();
9064     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9065                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9066   %}
9067   ins_pipe( pipe_slow );
9068 %}
9069 
9070 instruct vsub_reg_masked(vec dst, vec src2, kReg mask) %{
9071   match(Set dst (SubVB (Binary dst src2) mask));
9072   match(Set dst (SubVS (Binary dst src2) mask));
9073   match(Set dst (SubVI (Binary dst src2) mask));
9074   match(Set dst (SubVL (Binary dst src2) mask));
9075   match(Set dst (SubVF (Binary dst src2) mask));
9076   match(Set dst (SubVD (Binary dst src2) mask));
9077   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
9078   ins_encode %{
9079     int vlen_enc = vector_length_encoding(this);
9080     BasicType bt = Matcher::vector_element_basic_type(this);
9081     int opc = this->ideal_Opcode();
9082     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9083                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9084   %}
9085   ins_pipe( pipe_slow );
9086 %}
9087 
9088 instruct vsub_mem_masked(vec dst, memory src2, kReg mask) %{
9089   match(Set dst (SubVB (Binary dst (LoadVector src2)) mask));
9090   match(Set dst (SubVS (Binary dst (LoadVector src2)) mask));
9091   match(Set dst (SubVI (Binary dst (LoadVector src2)) mask));
9092   match(Set dst (SubVL (Binary dst (LoadVector src2)) mask));
9093   match(Set dst (SubVF (Binary dst (LoadVector src2)) mask));
9094   match(Set dst (SubVD (Binary dst (LoadVector src2)) mask));
9095   format %{ "vpsub_masked $dst, $dst, $src2, $mask\t! sub masked operation" %}
9096   ins_encode %{
9097     int vlen_enc = vector_length_encoding(this);
9098     BasicType bt = Matcher::vector_element_basic_type(this);
9099     int opc = this->ideal_Opcode();
9100     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9101                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9102   %}
9103   ins_pipe( pipe_slow );
9104 %}
9105 
9106 instruct vmul_reg_masked(vec dst, vec src2, kReg mask) %{
9107   match(Set dst (MulVS (Binary dst src2) mask));
9108   match(Set dst (MulVI (Binary dst src2) mask));
9109   match(Set dst (MulVL (Binary dst src2) mask));
9110   match(Set dst (MulVF (Binary dst src2) mask));
9111   match(Set dst (MulVD (Binary dst src2) mask));
9112   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
9113   ins_encode %{
9114     int vlen_enc = vector_length_encoding(this);
9115     BasicType bt = Matcher::vector_element_basic_type(this);
9116     int opc = this->ideal_Opcode();
9117     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9118                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9119   %}
9120   ins_pipe( pipe_slow );
9121 %}
9122 
9123 instruct vmul_mem_masked(vec dst, memory src2, kReg mask) %{
9124   match(Set dst (MulVS (Binary dst (LoadVector src2)) mask));
9125   match(Set dst (MulVI (Binary dst (LoadVector src2)) mask));
9126   match(Set dst (MulVL (Binary dst (LoadVector src2)) mask));
9127   match(Set dst (MulVF (Binary dst (LoadVector src2)) mask));
9128   match(Set dst (MulVD (Binary dst (LoadVector src2)) mask));
9129   format %{ "vpmul_masked $dst, $dst, $src2, $mask\t! mul masked operation" %}
9130   ins_encode %{
9131     int vlen_enc = vector_length_encoding(this);
9132     BasicType bt = Matcher::vector_element_basic_type(this);
9133     int opc = this->ideal_Opcode();
9134     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9135                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9136   %}
9137   ins_pipe( pipe_slow );
9138 %}
9139 
9140 instruct vsqrt_reg_masked(vec dst, kReg mask) %{
9141   match(Set dst (SqrtVF dst mask));
9142   match(Set dst (SqrtVD dst mask));
9143   ins_cost(100);
9144   format %{ "vpsqrt_masked $dst, $mask\t! sqrt masked operation" %}
9145   ins_encode %{
9146     int vlen_enc = vector_length_encoding(this);
9147     BasicType bt = Matcher::vector_element_basic_type(this);
9148     int opc = this->ideal_Opcode();
9149     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9150                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
9151   %}
9152   ins_pipe( pipe_slow );
9153 %}
9154 
9155 instruct vdiv_reg_masked(vec dst, vec src2, kReg mask) %{
9156   match(Set dst (DivVF (Binary dst src2) mask));
9157   match(Set dst (DivVD (Binary dst src2) mask));
9158   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
9159   ins_encode %{
9160     int vlen_enc = vector_length_encoding(this);
9161     BasicType bt = Matcher::vector_element_basic_type(this);
9162     int opc = this->ideal_Opcode();
9163     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9164                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9165   %}
9166   ins_pipe( pipe_slow );
9167 %}
9168 
9169 instruct vdiv_mem_masked(vec dst, memory src2, kReg mask) %{
9170   match(Set dst (DivVF (Binary dst (LoadVector src2)) mask));
9171   match(Set dst (DivVD (Binary dst (LoadVector src2)) mask));
9172   format %{ "vpdiv_masked $dst, $dst, $src2, $mask\t! div masked operation" %}
9173   ins_encode %{
9174     int vlen_enc = vector_length_encoding(this);
9175     BasicType bt = Matcher::vector_element_basic_type(this);
9176     int opc = this->ideal_Opcode();
9177     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9178                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9179   %}
9180   ins_pipe( pipe_slow );
9181 %}
9182 
9183 
9184 instruct vrol_imm_masked(vec dst, immI8 shift, kReg mask) %{
9185   match(Set dst (RotateLeftV (Binary dst shift) mask));
9186   match(Set dst (RotateRightV (Binary dst shift) mask));
9187   format %{ "vprotate_imm_masked $dst, $dst, $shift, $mask\t! rotate masked operation" %}
9188   ins_encode %{
9189     int vlen_enc = vector_length_encoding(this);
9190     BasicType bt = Matcher::vector_element_basic_type(this);
9191     int opc = this->ideal_Opcode();
9192     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9193                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9194   %}
9195   ins_pipe( pipe_slow );
9196 %}
9197 
9198 instruct vrol_reg_masked(vec dst, vec src2, kReg mask) %{
9199   match(Set dst (RotateLeftV (Binary dst src2) mask));
9200   match(Set dst (RotateRightV (Binary dst src2) mask));
9201   format %{ "vrotate_masked $dst, $dst, $src2, $mask\t! rotate masked operation" %}
9202   ins_encode %{
9203     int vlen_enc = vector_length_encoding(this);
9204     BasicType bt = Matcher::vector_element_basic_type(this);
9205     int opc = this->ideal_Opcode();
9206     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9207                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9208   %}
9209   ins_pipe( pipe_slow );
9210 %}
9211 
9212 instruct vlshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9213   match(Set dst (LShiftVS (Binary dst (LShiftCntV shift)) mask));
9214   match(Set dst (LShiftVI (Binary dst (LShiftCntV shift)) mask));
9215   match(Set dst (LShiftVL (Binary dst (LShiftCntV shift)) mask));
9216   format %{ "vplshift_imm_masked $dst, $dst, $shift, $mask\t! lshift masked operation" %}
9217   ins_encode %{
9218     int vlen_enc = vector_length_encoding(this);
9219     BasicType bt = Matcher::vector_element_basic_type(this);
9220     int opc = this->ideal_Opcode();
9221     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9222                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9223   %}
9224   ins_pipe( pipe_slow );
9225 %}
9226 
9227 instruct vlshift_reg_masked(vec dst, vec src2, kReg mask) %{
9228   predicate(!n->as_ShiftV()->is_var_shift());
9229   match(Set dst (LShiftVS (Binary dst src2) mask));
9230   match(Set dst (LShiftVI (Binary dst src2) mask));
9231   match(Set dst (LShiftVL (Binary dst src2) mask));
9232   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9233   ins_encode %{
9234     int vlen_enc = vector_length_encoding(this);
9235     BasicType bt = Matcher::vector_element_basic_type(this);
9236     int opc = this->ideal_Opcode();
9237     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9238                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9239   %}
9240   ins_pipe( pipe_slow );
9241 %}
9242 
9243 instruct vlshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9244   predicate(n->as_ShiftV()->is_var_shift());
9245   match(Set dst (LShiftVS (Binary dst src2) mask));
9246   match(Set dst (LShiftVI (Binary dst src2) mask));
9247   match(Set dst (LShiftVL (Binary dst src2) mask));
9248   format %{ "vplshiftv_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9249   ins_encode %{
9250     int vlen_enc = vector_length_encoding(this);
9251     BasicType bt = Matcher::vector_element_basic_type(this);
9252     int opc = this->ideal_Opcode();
9253     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9254                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9255   %}
9256   ins_pipe( pipe_slow );
9257 %}
9258 
9259 instruct vlshift_mem_masked(vec dst, memory src2, kReg mask) %{
9260   match(Set dst (LShiftVS (Binary dst (LoadVector src2)) mask));
9261   match(Set dst (LShiftVI (Binary dst (LoadVector src2)) mask));
9262   match(Set dst (LShiftVL (Binary dst (LoadVector src2)) mask));
9263   format %{ "vplshift_masked $dst, $dst, $src2, $mask\t! lshift masked operation" %}
9264   ins_encode %{
9265     int vlen_enc = vector_length_encoding(this);
9266     BasicType bt = Matcher::vector_element_basic_type(this);
9267     int opc = this->ideal_Opcode();
9268     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9269                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9270   %}
9271   ins_pipe( pipe_slow );
9272 %}
9273 
9274 instruct vrshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9275   match(Set dst (RShiftVS (Binary dst (RShiftCntV shift)) mask));
9276   match(Set dst (RShiftVI (Binary dst (RShiftCntV shift)) mask));
9277   match(Set dst (RShiftVL (Binary dst (RShiftCntV shift)) mask));
9278   format %{ "vprshift_imm_masked $dst, $dst, $shift, $mask\t! rshift masked operation" %}
9279   ins_encode %{
9280     int vlen_enc = vector_length_encoding(this);
9281     BasicType bt = Matcher::vector_element_basic_type(this);
9282     int opc = this->ideal_Opcode();
9283     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9284                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9285   %}
9286   ins_pipe( pipe_slow );
9287 %}
9288 
9289 instruct vrshift_reg_masked(vec dst, vec src2, kReg mask) %{
9290   predicate(!n->as_ShiftV()->is_var_shift());
9291   match(Set dst (RShiftVS (Binary dst src2) mask));
9292   match(Set dst (RShiftVI (Binary dst src2) mask));
9293   match(Set dst (RShiftVL (Binary dst src2) mask));
9294   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9295   ins_encode %{
9296     int vlen_enc = vector_length_encoding(this);
9297     BasicType bt = Matcher::vector_element_basic_type(this);
9298     int opc = this->ideal_Opcode();
9299     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9300                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9301   %}
9302   ins_pipe( pipe_slow );
9303 %}
9304 
9305 instruct vrshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9306   predicate(n->as_ShiftV()->is_var_shift());
9307   match(Set dst (RShiftVS (Binary dst src2) mask));
9308   match(Set dst (RShiftVI (Binary dst src2) mask));
9309   match(Set dst (RShiftVL (Binary dst src2) mask));
9310   format %{ "vprshiftv_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9311   ins_encode %{
9312     int vlen_enc = vector_length_encoding(this);
9313     BasicType bt = Matcher::vector_element_basic_type(this);
9314     int opc = this->ideal_Opcode();
9315     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9316                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9317   %}
9318   ins_pipe( pipe_slow );
9319 %}
9320 
9321 instruct vrshift_mem_masked(vec dst, memory src2, kReg mask) %{
9322   match(Set dst (RShiftVS (Binary dst (LoadVector src2)) mask));
9323   match(Set dst (RShiftVI (Binary dst (LoadVector src2)) mask));
9324   match(Set dst (RShiftVL (Binary dst (LoadVector src2)) mask));
9325   format %{ "vprshift_masked $dst, $dst, $src2, $mask\t! rshift masked operation" %}
9326   ins_encode %{
9327     int vlen_enc = vector_length_encoding(this);
9328     BasicType bt = Matcher::vector_element_basic_type(this);
9329     int opc = this->ideal_Opcode();
9330     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9331                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9332   %}
9333   ins_pipe( pipe_slow );
9334 %}
9335 
9336 instruct vurshift_imm_masked(vec dst, immI8 shift, kReg mask) %{
9337   match(Set dst (URShiftVS (Binary dst (RShiftCntV shift)) mask));
9338   match(Set dst (URShiftVI (Binary dst (RShiftCntV shift)) mask));
9339   match(Set dst (URShiftVL (Binary dst (RShiftCntV shift)) mask));
9340   format %{ "vpurshift_imm_masked $dst, $dst, $shift, $mask\t! urshift masked operation" %}
9341   ins_encode %{
9342     int vlen_enc = vector_length_encoding(this);
9343     BasicType bt = Matcher::vector_element_basic_type(this);
9344     int opc = this->ideal_Opcode();
9345     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9346                    $dst$$XMMRegister, $shift$$constant, true, vlen_enc);
9347   %}
9348   ins_pipe( pipe_slow );
9349 %}
9350 
9351 instruct vurshift_reg_masked(vec dst, vec src2, kReg mask) %{
9352   predicate(!n->as_ShiftV()->is_var_shift());
9353   match(Set dst (URShiftVS (Binary dst src2) mask));
9354   match(Set dst (URShiftVI (Binary dst src2) mask));
9355   match(Set dst (URShiftVL (Binary dst src2) mask));
9356   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
9357   ins_encode %{
9358     int vlen_enc = vector_length_encoding(this);
9359     BasicType bt = Matcher::vector_element_basic_type(this);
9360     int opc = this->ideal_Opcode();
9361     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9362                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, false);
9363   %}
9364   ins_pipe( pipe_slow );
9365 %}
9366 
9367 instruct vurshiftv_reg_masked(vec dst, vec src2, kReg mask) %{
9368   predicate(n->as_ShiftV()->is_var_shift());
9369   match(Set dst (URShiftVS (Binary dst src2) mask));
9370   match(Set dst (URShiftVI (Binary dst src2) mask));
9371   match(Set dst (URShiftVL (Binary dst src2) mask));
9372   format %{ "vpurshiftv_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
9373   ins_encode %{
9374     int vlen_enc = vector_length_encoding(this);
9375     BasicType bt = Matcher::vector_element_basic_type(this);
9376     int opc = this->ideal_Opcode();
9377     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9378                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc, true);
9379   %}
9380   ins_pipe( pipe_slow );
9381 %}
9382 
9383 instruct vurshift_mem_masked(vec dst, memory src2, kReg mask) %{
9384   match(Set dst (URShiftVS (Binary dst (LoadVector src2)) mask));
9385   match(Set dst (URShiftVI (Binary dst (LoadVector src2)) mask));
9386   match(Set dst (URShiftVL (Binary dst (LoadVector src2)) mask));
9387   format %{ "vpurshift_masked $dst, $dst, $src2, $mask\t! urshift masked operation" %}
9388   ins_encode %{
9389     int vlen_enc = vector_length_encoding(this);
9390     BasicType bt = Matcher::vector_element_basic_type(this);
9391     int opc = this->ideal_Opcode();
9392     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9393                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9394   %}
9395   ins_pipe( pipe_slow );
9396 %}
9397 
9398 instruct vmaxv_reg_masked(vec dst, vec src2, kReg mask) %{
9399   match(Set dst (MaxV (Binary dst src2) mask));
9400   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
9401   ins_encode %{
9402     int vlen_enc = vector_length_encoding(this);
9403     BasicType bt = Matcher::vector_element_basic_type(this);
9404     int opc = this->ideal_Opcode();
9405     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9406                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9407   %}
9408   ins_pipe( pipe_slow );
9409 %}
9410 
9411 instruct vmaxv_mem_masked(vec dst, memory src2, kReg mask) %{
9412   match(Set dst (MaxV (Binary dst (LoadVector src2)) mask));
9413   format %{ "vpmax_masked $dst, $dst, $src2, $mask\t! max masked operation" %}
9414   ins_encode %{
9415     int vlen_enc = vector_length_encoding(this);
9416     BasicType bt = Matcher::vector_element_basic_type(this);
9417     int opc = this->ideal_Opcode();
9418     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9419                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9420   %}
9421   ins_pipe( pipe_slow );
9422 %}
9423 
9424 instruct vminv_reg_masked(vec dst, vec src2, kReg mask) %{
9425   match(Set dst (MinV (Binary dst src2) mask));
9426   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
9427   ins_encode %{
9428     int vlen_enc = vector_length_encoding(this);
9429     BasicType bt = Matcher::vector_element_basic_type(this);
9430     int opc = this->ideal_Opcode();
9431     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9432                    $dst$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
9433   %}
9434   ins_pipe( pipe_slow );
9435 %}
9436 
9437 instruct vminv_mem_masked(vec dst, memory src2, kReg mask) %{
9438   match(Set dst (MinV (Binary dst (LoadVector src2)) mask));
9439   format %{ "vpmin_masked $dst, $dst, $src2, $mask\t! min masked operation" %}
9440   ins_encode %{
9441     int vlen_enc = vector_length_encoding(this);
9442     BasicType bt = Matcher::vector_element_basic_type(this);
9443     int opc = this->ideal_Opcode();
9444     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9445                    $dst$$XMMRegister, $src2$$Address, true, vlen_enc);
9446   %}
9447   ins_pipe( pipe_slow );
9448 %}
9449 
9450 instruct vrearrangev_reg_masked(vec dst, vec src2, kReg mask) %{
9451   match(Set dst (VectorRearrange (Binary dst src2) mask));
9452   format %{ "vprearrange_masked $dst, $dst, $src2, $mask\t! rearrange masked operation" %}
9453   ins_encode %{
9454     int vlen_enc = vector_length_encoding(this);
9455     BasicType bt = Matcher::vector_element_basic_type(this);
9456     int opc = this->ideal_Opcode();
9457     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9458                    $dst$$XMMRegister, $src2$$XMMRegister, false, vlen_enc);
9459   %}
9460   ins_pipe( pipe_slow );
9461 %}
9462 
9463 instruct vabs_masked(vec dst, kReg mask) %{
9464   match(Set dst (AbsVB dst mask));
9465   match(Set dst (AbsVS dst mask));
9466   match(Set dst (AbsVI dst mask));
9467   match(Set dst (AbsVL dst mask));
9468   format %{ "vabs_masked $dst, $mask \t! vabs masked operation" %}
9469   ins_cost(100);
9470   ins_encode %{
9471     int vlen_enc = vector_length_encoding(this);
9472     BasicType bt = Matcher::vector_element_basic_type(this);
9473     int opc = this->ideal_Opcode();
9474     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9475                    $dst$$XMMRegister, $dst$$XMMRegister, true, vlen_enc);
9476   %}
9477   ins_pipe( pipe_slow );
9478 %}
9479 
9480 instruct vfma_reg_masked(vec dst, vec src2, vec src3, kReg mask) %{
9481   match(Set dst (FmaVF (Binary dst src2) (Binary src3 mask)));
9482   match(Set dst (FmaVD (Binary dst src2) (Binary src3 mask)));
9483   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
9484   ins_encode %{
9485     int vlen_enc = vector_length_encoding(this);
9486     BasicType bt = Matcher::vector_element_basic_type(this);
9487     int opc = this->ideal_Opcode();
9488     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9489                    $src2$$XMMRegister, $src3$$XMMRegister, true, vlen_enc);
9490   %}
9491   ins_pipe( pipe_slow );
9492 %}
9493 
9494 instruct vfma_mem_masked(vec dst, vec src2, memory src3, kReg mask) %{
9495   match(Set dst (FmaVF (Binary dst src2) (Binary (LoadVector src3) mask)));
9496   match(Set dst (FmaVD (Binary dst src2) (Binary (LoadVector src3) mask)));
9497   format %{ "vfma_masked $dst, $src2, $src3, $mask \t! vfma masked operation" %}
9498   ins_encode %{
9499     int vlen_enc = vector_length_encoding(this);
9500     BasicType bt = Matcher::vector_element_basic_type(this);
9501     int opc = this->ideal_Opcode();
9502     __ evmasked_op(opc, bt, $mask$$KRegister, $dst$$XMMRegister,
9503                    $src2$$XMMRegister, $src3$$Address, true, vlen_enc);
9504   %}
9505   ins_pipe( pipe_slow );
9506 %}
9507 
9508 instruct evcmp_masked(kReg dst, vec src1, vec src2, immI8 cond, kReg mask, rRegP scratch) %{
9509   match(Set dst (VectorMaskCmp (Binary src1 src2) (Binary cond mask)));
9510   effect(TEMP scratch);
9511   format %{ "vcmp_masked $dst, $src1, $src2, $cond, $mask\t! using $scratch as TEMP" %}
9512   ins_encode %{
9513     assert(bottom_type()->isa_vectmask(), "TypeVectMask expected");
9514     int vlen_enc = vector_length_encoding(this, $src1);
9515     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
9516 
9517     // Comparison i
9518     switch (src1_elem_bt) {
9519       case T_BYTE: {
9520         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9521         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9522         __ evpcmpb($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9523         break;
9524       }
9525       case T_SHORT: {
9526         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9527         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9528         __ evpcmpw($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9529         break;
9530       }
9531       case T_INT: {
9532         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9533         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9534         __ evpcmpd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9535         break;
9536       }
9537       case T_LONG: {
9538         bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
9539         Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
9540         __ evpcmpq($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
9541         break;
9542       }
9543       case T_FLOAT: {
9544         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
9545         __ evcmpps($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
9546         break;
9547       }
9548       case T_DOUBLE: {
9549         Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
9550         __ evcmppd($dst$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
9551         break;
9552       }
9553       default: assert(false, "%s", type2name(src1_elem_bt)); break;
9554     }
9555   %}
9556   ins_pipe( pipe_slow );
9557 %}
9558 
9559 instruct mask_all_evexI_LE32(kReg dst, rRegI src) %{
9560   predicate(Matcher::vector_length(n) <= 32);
9561   match(Set dst (MaskAll src));
9562   format %{ "mask_all_evexI_LE32 $dst, $src \t" %}
9563   ins_encode %{
9564     int mask_len = Matcher::vector_length(this);
9565     __ vector_maskall_operation($dst$$KRegister, $src$$Register, mask_len);
9566   %}
9567   ins_pipe( pipe_slow );
9568 %}
9569 
9570 #ifdef _LP64
9571 instruct mask_not_immLT8(kReg dst, kReg src, rRegI rtmp, kReg ktmp, immI_M1 cnt) %{
9572   predicate(Matcher::vector_length(n) < 8 && VM_Version::supports_avx512dq());
9573   match(Set dst (XorVMask src (MaskAll cnt)));
9574   effect(TEMP_DEF dst, TEMP rtmp, TEMP ktmp);
9575   format %{ "mask_not_LT8 $dst, $src, $cnt \t!using $ktmp and $rtmp as TEMP" %}
9576   ins_encode %{
9577     uint masklen = Matcher::vector_length(this);
9578     __ knot(masklen, $dst$$KRegister, $src$$KRegister, $ktmp$$KRegister, $rtmp$$Register);
9579   %}
9580   ins_pipe( pipe_slow );
9581 %}
9582 
9583 instruct mask_not_imm(kReg dst, kReg src, immI_M1 cnt) %{
9584   predicate((Matcher::vector_length(n) == 8 && VM_Version::supports_avx512dq()) ||
9585             (Matcher::vector_length(n) == 16) ||
9586             (Matcher::vector_length(n) > 16 && VM_Version::supports_avx512bw()));
9587   match(Set dst (XorVMask src (MaskAll cnt)));
9588   format %{ "mask_not $dst, $src, $cnt \t! mask not operation" %}
9589   ins_encode %{
9590     uint masklen = Matcher::vector_length(this);
9591     __ knot(masklen, $dst$$KRegister, $src$$KRegister);
9592   %}
9593   ins_pipe( pipe_slow );
9594 %}
9595 
9596 instruct long_to_maskLE8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp) %{
9597   predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) <= 8);
9598   match(Set dst (VectorLongToMask src));
9599   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp);
9600   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp as TEMP" %}
9601   ins_encode %{
9602     int mask_len = Matcher::vector_length(this);
9603     int vec_enc  = vector_length_encoding(mask_len);
9604     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
9605                               $rtmp2$$Register, xnoreg, mask_len, vec_enc);
9606   %}
9607   ins_pipe( pipe_slow );
9608 %}
9609 
9610 
9611 instruct long_to_maskGT8_avx(vec dst, rRegL src, rRegL rtmp1, rRegL rtmp2, vec xtmp1, rFlagsReg cr) %{
9612   predicate(n->bottom_type()->isa_vectmask() == NULL && Matcher::vector_length(n) > 8);
9613   match(Set dst (VectorLongToMask src));
9614   effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, TEMP xtmp1, KILL cr);
9615   format %{ "long_to_mask_avx $dst, $src\t! using $rtmp1, $rtmp2, $xtmp1, as TEMP" %}
9616   ins_encode %{
9617     int mask_len = Matcher::vector_length(this);
9618     assert(mask_len <= 32, "invalid mask length");
9619     int vec_enc  = vector_length_encoding(mask_len);
9620     __ vector_long_to_maskvec($dst$$XMMRegister, $src$$Register, $rtmp1$$Register,
9621                               $rtmp2$$Register, $xtmp1$$XMMRegister, mask_len, vec_enc);
9622   %}
9623   ins_pipe( pipe_slow );
9624 %}
9625 
9626 instruct long_to_mask_evex(kReg dst, rRegL src) %{
9627   predicate(n->bottom_type()->isa_vectmask());
9628   match(Set dst (VectorLongToMask src));
9629   format %{ "long_to_mask_evex $dst, $src\t!" %}
9630   ins_encode %{
9631     __ kmov($dst$$KRegister, $src$$Register);
9632   %}
9633   ins_pipe( pipe_slow );
9634 %}
9635 #endif
9636 
9637 instruct mask_opers_evex(kReg dst, kReg src1, kReg src2, kReg kscratch) %{
9638   match(Set dst (AndVMask src1 src2));
9639   match(Set dst (OrVMask src1 src2));
9640   match(Set dst (XorVMask src1 src2));
9641   effect(TEMP kscratch);
9642   format %{ "mask_opers_evex $dst, $src1, $src2\t! using $kscratch as TEMP" %}
9643   ins_encode %{
9644     const MachNode* mask1 = static_cast<const MachNode*>(this->in(this->operand_index($src1)));
9645     const MachNode* mask2 = static_cast<const MachNode*>(this->in(this->operand_index($src2)));
9646     assert(0 == Type::cmp(mask1->bottom_type(), mask2->bottom_type()), "");
9647     uint masklen = Matcher::vector_length(this);
9648     masklen = (masklen < 16 && !VM_Version::supports_avx512dq()) ? 16 : masklen;
9649     __ masked_op(this->ideal_Opcode(), masklen, $dst$$KRegister, $src1$$KRegister, $src2$$KRegister);
9650   %}
9651   ins_pipe( pipe_slow );
9652 %}
9653 
9654 instruct vternlog_reg_masked(vec dst, vec src2, vec src3, immU8 func, kReg mask) %{
9655   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
9656   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
9657   ins_encode %{
9658     int vlen_enc = vector_length_encoding(this);
9659     BasicType bt = Matcher::vector_element_basic_type(this);
9660     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
9661                   $src2$$XMMRegister, $src3$$XMMRegister, true, bt, vlen_enc);
9662   %}
9663   ins_pipe( pipe_slow );
9664 %}
9665 
9666 instruct vternlogd_mem_masked(vec dst, vec src2, memory src3, immU8 func, kReg mask) %{
9667   match(Set dst (MacroLogicV dst (Binary src2 (Binary src3 (Binary func mask)))));
9668   format %{ "vternlog_masked $dst,$src2,$src3,$func,$mask\t! vternlog masked operation" %}
9669   ins_encode %{
9670     int vlen_enc = vector_length_encoding(this);
9671     BasicType bt = Matcher::vector_element_basic_type(this);
9672     __ evpternlog($dst$$XMMRegister, $func$$constant, $mask$$KRegister,
9673                   $src2$$XMMRegister, $src3$$Address, true, bt, vlen_enc);
9674   %}
9675   ins_pipe( pipe_slow );
9676 %}
9677 
9678 instruct castMM(kReg dst)
9679 %{
9680   match(Set dst (CastVV dst));
9681 
9682   size(0);
9683   format %{ "# castVV of $dst" %}
9684   ins_encode(/* empty encoding */);
9685   ins_cost(0);
9686   ins_pipe(empty);
9687 %}
9688 
9689 instruct castVV(vec dst)
9690 %{
9691   match(Set dst (CastVV dst));
9692 
9693   size(0);
9694   format %{ "# castVV of $dst" %}
9695   ins_encode(/* empty encoding */);
9696   ins_cost(0);
9697   ins_pipe(empty);
9698 %}
9699 
9700 instruct castVVLeg(legVec dst)
9701 %{
9702   match(Set dst (CastVV dst));
9703 
9704   size(0);
9705   format %{ "# castVV of $dst" %}
9706   ins_encode(/* empty encoding */);
9707   ins_cost(0);
9708   ins_pipe(empty);
9709 %}