1 //
   2 // Copyright (c) 2011, 2021, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // archtecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
  61 
  62 // XMM registers.  512-bit registers or 8 words each, labeled (a)-p.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 version intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperword flags).
  67 // For pre EVEX enabled architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
  69 // For EVEX enabled architectures:
  70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
  71 //
  72 // Linux ABI:   No register preserved across function calls
  73 //              XMM0-XMM7 might hold parameters
  74 // Windows ABI: XMM6-XMM31 preserved across function calls
  75 //              XMM0-XMM3 might hold parameters
  76 
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
 630 
 631 // AVX3 Mask Registers.
 632 reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
 633 reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());
 634 
 635 reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
 636 reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());
 637 
 638 reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
 639 reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());
 640 
 641 reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
 642 reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());
 643 
 644 reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
 645 reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());
 646 
 647 reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
 648 reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());
 649 
 650 reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
 651 reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());
 652 
 653 
 654 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 655                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 656                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 657                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 658                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 659                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 660                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 661                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 662 #ifdef _LP64
 663                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 664                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 665                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 666                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 667                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 668                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 669                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 670                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 671                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 672                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 673                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 674                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 675                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 676                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 677                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 678                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 679                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 680                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 681                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 682                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 683                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 684                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 685                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 686                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 687 #endif
 688                       );
 689 
 690 alloc_class chunk2(K7, K7_H,
 691                    K6, K6_H,
 692                    K5, K5_H,
 693                    K4, K4_H,
 694                    K3, K3_H,
 695                    K2, K2_H,
 696                    K1, K1_H);
 697 
 698 reg_class  vectmask_reg(K1, K1_H,
 699                         K2, K2_H,
 700                         K3, K3_H,
 701                         K4, K4_H,
 702                         K5, K5_H,
 703                         K6, K6_H,
 704                         K7, K7_H);
 705 
 706 reg_class vectmask_reg_K1(K1, K1_H);
 707 reg_class vectmask_reg_K2(K2, K2_H);
 708 reg_class vectmask_reg_K3(K3, K3_H);
 709 reg_class vectmask_reg_K4(K4, K4_H);
 710 reg_class vectmask_reg_K5(K5, K5_H);
 711 reg_class vectmask_reg_K6(K6, K6_H);
 712 reg_class vectmask_reg_K7(K7, K7_H);
 713 
 714 // flags allocation class should be last.
 715 alloc_class chunk3(RFLAGS);
 716 
 717 
 718 // Singleton class for condition codes
 719 reg_class int_flags(RFLAGS);
 720 
 721 // Class for pre evex float registers
 722 reg_class float_reg_legacy(XMM0,
 723                     XMM1,
 724                     XMM2,
 725                     XMM3,
 726                     XMM4,
 727                     XMM5,
 728                     XMM6,
 729                     XMM7
 730 #ifdef _LP64
 731                    ,XMM8,
 732                     XMM9,
 733                     XMM10,
 734                     XMM11,
 735                     XMM12,
 736                     XMM13,
 737                     XMM14,
 738                     XMM15
 739 #endif
 740                     );
 741 
 742 // Class for evex float registers
 743 reg_class float_reg_evex(XMM0,
 744                     XMM1,
 745                     XMM2,
 746                     XMM3,
 747                     XMM4,
 748                     XMM5,
 749                     XMM6,
 750                     XMM7
 751 #ifdef _LP64
 752                    ,XMM8,
 753                     XMM9,
 754                     XMM10,
 755                     XMM11,
 756                     XMM12,
 757                     XMM13,
 758                     XMM14,
 759                     XMM15,
 760                     XMM16,
 761                     XMM17,
 762                     XMM18,
 763                     XMM19,
 764                     XMM20,
 765                     XMM21,
 766                     XMM22,
 767                     XMM23,
 768                     XMM24,
 769                     XMM25,
 770                     XMM26,
 771                     XMM27,
 772                     XMM28,
 773                     XMM29,
 774                     XMM30,
 775                     XMM31
 776 #endif
 777                     );
 778 
 779 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
 780 reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 781 
 782 // Class for pre evex double registers
 783 reg_class double_reg_legacy(XMM0,  XMM0b,
 784                      XMM1,  XMM1b,
 785                      XMM2,  XMM2b,
 786                      XMM3,  XMM3b,
 787                      XMM4,  XMM4b,
 788                      XMM5,  XMM5b,
 789                      XMM6,  XMM6b,
 790                      XMM7,  XMM7b
 791 #ifdef _LP64
 792                     ,XMM8,  XMM8b,
 793                      XMM9,  XMM9b,
 794                      XMM10, XMM10b,
 795                      XMM11, XMM11b,
 796                      XMM12, XMM12b,
 797                      XMM13, XMM13b,
 798                      XMM14, XMM14b,
 799                      XMM15, XMM15b
 800 #endif
 801                      );
 802 
 803 // Class for evex double registers
 804 reg_class double_reg_evex(XMM0,  XMM0b,
 805                      XMM1,  XMM1b,
 806                      XMM2,  XMM2b,
 807                      XMM3,  XMM3b,
 808                      XMM4,  XMM4b,
 809                      XMM5,  XMM5b,
 810                      XMM6,  XMM6b,
 811                      XMM7,  XMM7b
 812 #ifdef _LP64
 813                     ,XMM8,  XMM8b,
 814                      XMM9,  XMM9b,
 815                      XMM10, XMM10b,
 816                      XMM11, XMM11b,
 817                      XMM12, XMM12b,
 818                      XMM13, XMM13b,
 819                      XMM14, XMM14b,
 820                      XMM15, XMM15b,
 821                      XMM16, XMM16b,
 822                      XMM17, XMM17b,
 823                      XMM18, XMM18b,
 824                      XMM19, XMM19b,
 825                      XMM20, XMM20b,
 826                      XMM21, XMM21b,
 827                      XMM22, XMM22b,
 828                      XMM23, XMM23b,
 829                      XMM24, XMM24b,
 830                      XMM25, XMM25b,
 831                      XMM26, XMM26b,
 832                      XMM27, XMM27b,
 833                      XMM28, XMM28b,
 834                      XMM29, XMM29b,
 835                      XMM30, XMM30b,
 836                      XMM31, XMM31b
 837 #endif
 838                      );
 839 
 840 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 841 reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 842 
 843 // Class for pre evex 32bit vector registers
 844 reg_class vectors_reg_legacy(XMM0,
 845                       XMM1,
 846                       XMM2,
 847                       XMM3,
 848                       XMM4,
 849                       XMM5,
 850                       XMM6,
 851                       XMM7
 852 #ifdef _LP64
 853                      ,XMM8,
 854                       XMM9,
 855                       XMM10,
 856                       XMM11,
 857                       XMM12,
 858                       XMM13,
 859                       XMM14,
 860                       XMM15
 861 #endif
 862                       );
 863 
 864 // Class for evex 32bit vector registers
 865 reg_class vectors_reg_evex(XMM0,
 866                       XMM1,
 867                       XMM2,
 868                       XMM3,
 869                       XMM4,
 870                       XMM5,
 871                       XMM6,
 872                       XMM7
 873 #ifdef _LP64
 874                      ,XMM8,
 875                       XMM9,
 876                       XMM10,
 877                       XMM11,
 878                       XMM12,
 879                       XMM13,
 880                       XMM14,
 881                       XMM15,
 882                       XMM16,
 883                       XMM17,
 884                       XMM18,
 885                       XMM19,
 886                       XMM20,
 887                       XMM21,
 888                       XMM22,
 889                       XMM23,
 890                       XMM24,
 891                       XMM25,
 892                       XMM26,
 893                       XMM27,
 894                       XMM28,
 895                       XMM29,
 896                       XMM30,
 897                       XMM31
 898 #endif
 899                       );
 900 
 901 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 902 reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 903 
 904 // Class for all 64bit vector registers
 905 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 906                       XMM1,  XMM1b,
 907                       XMM2,  XMM2b,
 908                       XMM3,  XMM3b,
 909                       XMM4,  XMM4b,
 910                       XMM5,  XMM5b,
 911                       XMM6,  XMM6b,
 912                       XMM7,  XMM7b
 913 #ifdef _LP64
 914                      ,XMM8,  XMM8b,
 915                       XMM9,  XMM9b,
 916                       XMM10, XMM10b,
 917                       XMM11, XMM11b,
 918                       XMM12, XMM12b,
 919                       XMM13, XMM13b,
 920                       XMM14, XMM14b,
 921                       XMM15, XMM15b
 922 #endif
 923                       );
 924 
 925 // Class for all 64bit vector registers
 926 reg_class vectord_reg_evex(XMM0,  XMM0b,
 927                       XMM1,  XMM1b,
 928                       XMM2,  XMM2b,
 929                       XMM3,  XMM3b,
 930                       XMM4,  XMM4b,
 931                       XMM5,  XMM5b,
 932                       XMM6,  XMM6b,
 933                       XMM7,  XMM7b
 934 #ifdef _LP64
 935                      ,XMM8,  XMM8b,
 936                       XMM9,  XMM9b,
 937                       XMM10, XMM10b,
 938                       XMM11, XMM11b,
 939                       XMM12, XMM12b,
 940                       XMM13, XMM13b,
 941                       XMM14, XMM14b,
 942                       XMM15, XMM15b,
 943                       XMM16, XMM16b,
 944                       XMM17, XMM17b,
 945                       XMM18, XMM18b,
 946                       XMM19, XMM19b,
 947                       XMM20, XMM20b,
 948                       XMM21, XMM21b,
 949                       XMM22, XMM22b,
 950                       XMM23, XMM23b,
 951                       XMM24, XMM24b,
 952                       XMM25, XMM25b,
 953                       XMM26, XMM26b,
 954                       XMM27, XMM27b,
 955                       XMM28, XMM28b,
 956                       XMM29, XMM29b,
 957                       XMM30, XMM30b,
 958                       XMM31, XMM31b
 959 #endif
 960                       );
 961 
 962 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 963 reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 964 
 965 // Class for all 128bit vector registers
 966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 973                       XMM7,  XMM7b,  XMM7c,  XMM7d
 974 #ifdef _LP64
 975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 977                       XMM10, XMM10b, XMM10c, XMM10d,
 978                       XMM11, XMM11b, XMM11c, XMM11d,
 979                       XMM12, XMM12b, XMM12c, XMM12d,
 980                       XMM13, XMM13b, XMM13c, XMM13d,
 981                       XMM14, XMM14b, XMM14c, XMM14d,
 982                       XMM15, XMM15b, XMM15c, XMM15d
 983 #endif
 984                       );
 985 
 986 // Class for all 128bit vector registers
 987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 994                       XMM7,  XMM7b,  XMM7c,  XMM7d
 995 #ifdef _LP64
 996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 998                       XMM10, XMM10b, XMM10c, XMM10d,
 999                       XMM11, XMM11b, XMM11c, XMM11d,
1000                       XMM12, XMM12b, XMM12c, XMM12d,
1001                       XMM13, XMM13b, XMM13c, XMM13d,
1002                       XMM14, XMM14b, XMM14c, XMM14d,
1003                       XMM15, XMM15b, XMM15c, XMM15d,
1004                       XMM16, XMM16b, XMM16c, XMM16d,
1005                       XMM17, XMM17b, XMM17c, XMM17d,
1006                       XMM18, XMM18b, XMM18c, XMM18d,
1007                       XMM19, XMM19b, XMM19c, XMM19d,
1008                       XMM20, XMM20b, XMM20c, XMM20d,
1009                       XMM21, XMM21b, XMM21c, XMM21d,
1010                       XMM22, XMM22b, XMM22c, XMM22d,
1011                       XMM23, XMM23b, XMM23c, XMM23d,
1012                       XMM24, XMM24b, XMM24c, XMM24d,
1013                       XMM25, XMM25b, XMM25c, XMM25d,
1014                       XMM26, XMM26b, XMM26c, XMM26d,
1015                       XMM27, XMM27b, XMM27c, XMM27d,
1016                       XMM28, XMM28b, XMM28c, XMM28d,
1017                       XMM29, XMM29b, XMM29c, XMM29d,
1018                       XMM30, XMM30b, XMM30c, XMM30d,
1019                       XMM31, XMM31b, XMM31c, XMM31d
1020 #endif
1021                       );
1022 
1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1025 
1026 // Class for all 256bit vector registers
1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1035 #ifdef _LP64
1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
1044 #endif
1045                       );
1046 
1047 // Class for all 256bit vector registers
1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1056 #ifdef _LP64
1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1081 #endif
1082                       );
1083 
1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1086 
1087 // Class for all 512bit vector registers
1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1096 #ifdef _LP64
1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1121 #endif
1122                       );
1123 
1124 // Class for restricted 512bit vector registers
1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1133 #ifdef _LP64
1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1142 #endif
1143                       );
1144 
1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1147 
1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1149 %}
1150 
1151 
1152 //----------SOURCE BLOCK-------------------------------------------------------
1153 // This is a block of C++ code which provides values, functions, and
1154 // definitions necessary in the rest of the architecture description
1155 
1156 source_hpp %{
1157 // Header information of the source block.
1158 // Method declarations/definitions which are used outside
1159 // the ad-scope can conveniently be defined here.
1160 //
1161 // To keep related declarations/definitions/uses close together,
// we switch between source %{ %} and source_hpp %{ %} freely as needed.
1163 
1164 #include "runtime/vm_version.hpp"
1165 
1166 class NativeJump;
1167 
1168 class CallStubImpl {
1169 
1170   //--------------------------------------------------------------
1171   //---<  Used for optimization in Compile::shorten_branches  >---
1172   //--------------------------------------------------------------
1173 
1174  public:
1175   // Size of call trampoline stub.
1176   static uint size_call_trampoline() {
1177     return 0; // no call trampolines on this platform
1178   }
1179 
1180   // number of relocations needed by a call trampoline stub
1181   static uint reloc_call_trampoline() {
1182     return 0; // no call trampolines on this platform
1183   }
1184 };
1185 
1186 class HandlerImpl {
1187 
1188  public:
1189 
1190   static int emit_exception_handler(CodeBuffer &cbuf);
1191   static int emit_deopt_handler(CodeBuffer& cbuf);
1192 
1193   static uint size_exception_handler() {
1194     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1197     // Note that this value is also credited (in output.cpp) to
1198     // the size of the code section.
1199     return NativeJump::instruction_size;
1200   }
1201 
1202 #ifdef _LP64
1203   static uint size_deopt_handler() {
    // three 5-byte instructions plus one move for unreachable address.
1205     return 15+3;
1206   }
1207 #else
1208   static uint size_deopt_handler() {
1209     // NativeCall instruction size is the same as NativeJump.
    // The exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1212     // Note that this value is also credited (in output.cpp) to
1213     // the size of the code section.
1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1215   }
1216 #endif
1217 };
1218 
1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
1220   switch(bytes) {
1221     case  4: // fall-through
1222     case  8: // fall-through
1223     case 16: return Assembler::AVX_128bit;
1224     case 32: return Assembler::AVX_256bit;
1225     case 64: return Assembler::AVX_512bit;
1226 
1227     default: {
1228       ShouldNotReachHere();
1229       return Assembler::AVX_NoVec;
1230     }
1231   }
1232 }
1233 
1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
1236 }
1237 
1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1239   uint def_idx = use->operand_index(opnd);
1240   Node* def = use->in(def_idx);
1241   return vector_length_encoding(def);
1242 }
1243 
1244 static inline bool is_unsigned_booltest_pred(int bt) {
  return ((bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare);
1246 }
1247 
1248 class Node::PD {
1249 public:
1250   enum NodeFlags {
1251     Flag_intel_jcc_erratum = Node::_last_flag << 1,
1252     _last_flag             = Flag_intel_jcc_erratum
1253   };
1254 };
1255 
1256 %} // end source_hpp
1257 
1258 source %{
1259 
1260 #include "opto/addnode.hpp"
1261 #include "c2_intelJccErratum_x86.hpp"
1262 
1263 void PhaseOutput::pd_perform_mach_node_analysis() {
1264   if (VM_Version::has_intel_jcc_erratum()) {
1265     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
1266     _buf_sizes._code += extra_padding;
1267   }
1268 }
1269 
1270 int MachNode::pd_alignment_required() const {
1271   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
1272     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
1273     return IntelJccErratum::largest_jcc_size() + 1;
1274   } else {
1275     return 1;
1276   }
1277 }
1278 
1279 int MachNode::compute_padding(int current_offset) const {
1280   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
1281     Compile* C = Compile::current();
1282     PhaseOutput* output = C->output();
1283     Block* block = output->block();
1284     int index = output->index();
1285     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
1286   } else {
1287     return 0;
1288   }
1289 }
1290 
1291 // Emit exception handler code.
1292 // Stuff framesize into a register and call a VM stub routine.
1293 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1294 
1295   // Note that the code buffer's insts_mark is always relative to insts.
1296   // That's why we must use the macroassembler to generate a handler.
1297   C2_MacroAssembler _masm(&cbuf);
1298   address base = __ start_a_stub(size_exception_handler());
1299   if (base == NULL) {
1300     ciEnv::current()->record_failure("CodeCache is full");
1301     return 0;  // CodeBuffer::expand failed
1302   }
1303   int offset = __ offset();
1304   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1305   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1306   __ end_a_stub();
1307   return offset;
1308 }
1309 
1310 // Emit deopt handler code.
1311 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1312 
1313   // Note that the code buffer's insts_mark is always relative to insts.
1314   // That's why we must use the macroassembler to generate a handler.
1315   C2_MacroAssembler _masm(&cbuf);
1316   address base = __ start_a_stub(size_deopt_handler());
1317   if (base == NULL) {
1318     ciEnv::current()->record_failure("CodeCache is full");
1319     return 0;  // CodeBuffer::expand failed
1320   }
1321   int offset = __ offset();
1322 
1323 #ifdef _LP64
1324   address the_pc = (address) __ pc();
1325   Label next;
1326   // push a "the_pc" on the stack without destroying any registers
1327   // as they all may be live.
1328 
1329   // push address of "next"
1330   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1331   __ bind(next);
1332   // adjust it so it matches "the_pc"
1333   __ subptr(Address(rsp, 0), __ offset() - offset);
1334 #else
1335   InternalAddress here(__ pc());
1336   __ pushptr(here.addr());
1337 #endif
1338 
1339   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1340   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1341   __ end_a_stub();
1342   return offset;
1343 }
1344 
1345 Assembler::Width widthForType(BasicType bt) {
1346   if (bt == T_BYTE) {
1347     return Assembler::B;
1348   } else if (bt == T_SHORT) {
1349     return Assembler::W;
1350   } else if (bt == T_INT) {
1351     return Assembler::D;
1352   } else {
1353     assert(bt == T_LONG, "not a long: %s", type2name(bt));
1354     return Assembler::Q;
1355   }
1356 }
1357 
1358 //=============================================================================
1359 
1360   // Float masks come from different places depending on platform.
1361 #ifdef _LP64
1362   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1363   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1364   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1365   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1366 #else
1367   static address float_signmask()  { return (address)float_signmask_pool; }
1368   static address float_signflip()  { return (address)float_signflip_pool; }
1369   static address double_signmask() { return (address)double_signmask_pool; }
1370   static address double_signflip() { return (address)double_signflip_pool; }
1371 #endif
1372   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1373   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1374   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1375   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1376   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1377   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1378   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
1379   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1380   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1381   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1382   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1383   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1384 
1385 //=============================================================================
1386 const bool Matcher::match_rule_supported(int opcode) {
1387   if (!has_match_rule(opcode)) {
1388     return false; // no match rule present
1389   }
1390   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1391   switch (opcode) {
1392     case Op_AbsVL:
1393     case Op_StoreVectorScatter:
1394       if (UseAVX < 3) {
1395         return false;
1396       }
1397       break;
1398     case Op_PopCountI:
1399     case Op_PopCountL:
1400       if (!UsePopCountInstruction) {
1401         return false;
1402       }
1403       break;
1404     case Op_PopCountVI:
1405       if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
1406         return false;
1407       }
1408       break;
1409     case Op_MulVI:
1410       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
1411         return false;
1412       }
1413       break;
1414     case Op_MulVL:
1415       if (UseSSE < 4) { // only with SSE4_1 or AVX
1416         return false;
1417       }
1418       break;
1419     case Op_MulReductionVL:
1420       if (VM_Version::supports_avx512dq() == false) {
1421         return false;
1422       }
1423       break;
1424     case Op_AddReductionVL:
1425       if (UseSSE < 2) { // requires at least SSE2
1426         return false;
1427       }
1428       break;
1429     case Op_AbsVB:
1430     case Op_AbsVS:
1431     case Op_AbsVI:
1432     case Op_AddReductionVI:
1433     case Op_AndReductionV:
1434     case Op_OrReductionV:
1435     case Op_XorReductionV:
1436       if (UseSSE < 3) { // requires at least SSSE3
1437         return false;
1438       }
1439       break;
1440     case Op_VectorLoadShuffle:
1441     case Op_VectorRearrange:
1442     case Op_MulReductionVI:
1443       if (UseSSE < 4) { // requires at least SSE4
1444         return false;
1445       }
1446       break;
1447     case Op_SqrtVD:
1448     case Op_SqrtVF:
1449     case Op_VectorMaskCmp:
1450     case Op_VectorCastB2X:
1451     case Op_VectorCastS2X:
1452     case Op_VectorCastI2X:
1453     case Op_VectorCastL2X:
1454     case Op_VectorCastF2X:
1455     case Op_VectorCastD2X:
1456       if (UseAVX < 1) { // enabled for AVX only
1457         return false;
1458       }
1459       break;
1460     case Op_CompareAndSwapL:
1461 #ifdef _LP64
1462     case Op_CompareAndSwapP:
1463 #endif
1464       if (!VM_Version::supports_cx8()) {
1465         return false;
1466       }
1467       break;
1468     case Op_CMoveVF:
1469     case Op_CMoveVD:
1470       if (UseAVX < 1) { // enabled for AVX only
1471         return false;
1472       }
1473       break;
1474     case Op_StrIndexOf:
1475       if (!UseSSE42Intrinsics) {
1476         return false;
1477       }
1478       break;
1479     case Op_StrIndexOfChar:
1480       if (!UseSSE42Intrinsics) {
1481         return false;
1482       }
1483       break;
1484     case Op_OnSpinWait:
1485       if (VM_Version::supports_on_spin_wait() == false) {
1486         return false;
1487       }
1488       break;
1489     case Op_MulVB:
1490     case Op_LShiftVB:
1491     case Op_RShiftVB:
1492     case Op_URShiftVB:
1493     case Op_VectorInsert:
1494     case Op_VectorLoadMask:
1495     case Op_VectorStoreMask:
1496     case Op_VectorBlend:
1497       if (UseSSE < 4) {
1498         return false;
1499       }
1500       break;
1501 #ifdef _LP64
1502     case Op_MaxD:
1503     case Op_MaxF:
1504     case Op_MinD:
1505     case Op_MinF:
1506       if (UseAVX < 1) { // enabled for AVX only
1507         return false;
1508       }
1509       break;
1510 #endif
1511     case Op_CacheWB:
1512     case Op_CacheWBPreSync:
1513     case Op_CacheWBPostSync:
1514       if (!VM_Version::supports_data_cache_line_flush()) {
1515         return false;
1516       }
1517       break;
1518     case Op_ExtractB:
1519     case Op_ExtractL:
1520     case Op_ExtractI:
1521     case Op_RoundDoubleMode:
1522       if (UseSSE < 4) {
1523         return false;
1524       }
1525       break;
1526     case Op_RoundDoubleModeV:
1527       if (VM_Version::supports_avx() == false) {
1528         return false; // 128bit vroundpd is not available
1529       }
1530       break;
1531     case Op_LoadVectorGather:
1532       if (UseAVX < 2) {
1533         return false;
1534       }
1535       break;
1536     case Op_FmaVD:
1537     case Op_FmaVF:
1538       if (!UseFMA) {
1539         return false;
1540       }
1541       break;
1542     case Op_MacroLogicV:
1543       if (UseAVX < 3 || !UseVectorMacroLogic) {
1544         return false;
1545       }
1546       break;
1547 
1548     case Op_VectorCmpMasked:
1549     case Op_VectorMaskGen:
1550     case Op_LoadVectorMasked:
1551     case Op_StoreVectorMasked:
1552       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
1553         return false;
1554       }
1555       break;
1556     case Op_VectorMaskFirstTrue:
1557     case Op_VectorMaskLastTrue:
1558     case Op_VectorMaskTrueCount:
1559       if (!is_LP64 || UseAVX < 1) {
1560          return false;
1561       }
1562       break;
1563     case Op_CopySignD:
1564     case Op_CopySignF:
1565       if (UseAVX < 3 || !is_LP64)  {
1566         return false;
1567       }
1568       if (!VM_Version::supports_avx512vl()) {
1569         return false;
1570       }
1571       break;
1572 #ifndef _LP64
1573     case Op_AddReductionVF:
1574     case Op_AddReductionVD:
1575     case Op_MulReductionVF:
1576     case Op_MulReductionVD:
1577       if (UseSSE < 1) { // requires at least SSE
1578         return false;
1579       }
1580       break;
1581     case Op_MulAddVS2VI:
1582     case Op_RShiftVL:
1583     case Op_AbsVD:
1584     case Op_NegVD:
1585       if (UseSSE < 2) {
1586         return false;
1587       }
1588       break;
1589 #endif // !LP64
1590     case Op_SignumF:
1591       if (UseSSE < 1) {
1592         return false;
1593       }
1594       break;
1595     case Op_SignumD:
1596       if (UseSSE < 2) {
1597         return false;
1598       }
1599       break;
1600   }
1601   return true;  // Match rules are supported by default.
1602 }
1603 
1604 //------------------------------------------------------------------------
1605 
// Identify extra cases in which match rules for vector nodes and other intrinsics
// must additionally be guarded by vector length (vlen) and element type (bt).
1608 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1609   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1610   if (!match_rule_supported(opcode)) {
1611     return false;
1612   }
1613   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1614   //   * SSE2 supports 128bit vectors for all types;
1615   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1616   //   * AVX2 supports 256bit vectors for all types;
1617   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1618   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1619   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1620   // And MaxVectorSize is taken into account as well.
1621   if (!vector_size_supported(bt, vlen)) {
1622     return false;
1623   }
1624   // Special cases which require vector length follow:
1625   //   * implementation limitations
1626   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1627   //   * 128bit vroundpd instruction is present only in AVX1
1628   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
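  // Worked example: for vlen == 8 and bt == T_INT (4 bytes per element),
  // size_in_bits == 8 * 4 * 8 == 256 (a YMM-sized vector), whereas vlen == 8
  // with bt == T_DOUBLE gives 512 bits and hits the AVX512DQ checks below.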
1629   switch (opcode) {
1630     case Op_AbsVF:
1631     case Op_NegVF:
1632       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1633         return false; // 512bit vandps and vxorps are not available
1634       }
1635       break;
1636     case Op_AbsVD:
1637     case Op_NegVD:
1638     case Op_MulVL:
1639       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1640         return false; // 512bit vpmullq, vandpd and vxorpd are not available
1641       }
1642       break;
1643     case Op_CMoveVF:
1644       if (vlen != 8) {
1645         return false; // implementation limitation (only vcmov8F_reg is present)
1646       }
1647       break;
1648     case Op_RotateRightV:
1649     case Op_RotateLeftV:
1650       if (bt != T_INT && bt != T_LONG) {
1651         return false;
1652       } // fallthrough
1653     case Op_MacroLogicV:
1654       if (!VM_Version::supports_evex() ||
1655           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
1656         return false;
1657       }
1658       break;
1659     case Op_ClearArray:
1660     case Op_VectorMaskGen:
1661     case Op_VectorCmpMasked:
1662     case Op_LoadVectorMasked:
1663     case Op_StoreVectorMasked:
1664       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
1665         return false;
1666       }
1667       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
1668         return false;
1669       }
1670       break;
1671     case Op_CMoveVD:
1672       if (vlen != 4) {
1673         return false; // implementation limitation (only vcmov4D_reg is present)
1674       }
1675       break;
1676     case Op_MaxV:
1677     case Op_MinV:
1678       if (UseSSE < 4 && is_integral_type(bt)) {
1679         return false;
1680       }
      if (bt == T_FLOAT || bt == T_DOUBLE) {
        // Float/Double intrinsics are enabled for AVX family currently.
        if (UseAVX == 0) {
          return false;
        }
        if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
          return false;
        }
      }
1690       break;
1691     case Op_CallLeafVector:
1692       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
1693         return false;
1694       }
1695       break;
1696     case Op_AddReductionVI:
1697       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
1698         return false;
1699       }
1700       // fallthrough
1701     case Op_AndReductionV:
1702     case Op_OrReductionV:
1703     case Op_XorReductionV:
1704       if (is_subword_type(bt) && (UseSSE < 4)) {
1705         return false;
1706       }
1707 #ifndef _LP64
1708       if (bt == T_BYTE || bt == T_LONG) {
1709         return false;
1710       }
1711 #endif
1712       break;
1713 #ifndef _LP64
1714     case Op_VectorInsert:
1715       if (bt == T_LONG || bt == T_DOUBLE) {
1716         return false;
1717       }
1718       break;
1719 #endif
1720     case Op_MinReductionV:
1721     case Op_MaxReductionV:
1722       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
1723         return false;
1724       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
1725         return false;
1726       }
1727       // Float/Double intrinsics enabled for AVX family.
1728       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
1729         return false;
1730       }
1731       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
1732         return false;
1733       }
1734 #ifndef _LP64
1735       if (bt == T_BYTE || bt == T_LONG) {
1736         return false;
1737       }
1738 #endif
1739       break;
1740     case Op_VectorTest:
1741       if (UseSSE < 4) {
1742         return false; // Implementation limitation
1743       } else if (size_in_bits < 32) {
1744         return false; // Implementation limitation
1745       } else if (size_in_bits == 512 && (VM_Version::supports_avx512bw() == false)) {
1746         return false; // Implementation limitation
1747       }
1748       break;
1749     case Op_VectorLoadShuffle:
1750     case Op_VectorRearrange:
      if (vlen == 2) {
1752         return false; // Implementation limitation due to how shuffle is loaded
1753       } else if (size_in_bits == 256 && UseAVX < 2) {
1754         return false; // Implementation limitation
1755       } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi())  {
1756         return false; // Implementation limitation
1757       } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw())  {
1758         return false; // Implementation limitation
1759       }
1760       break;
1761     case Op_VectorLoadMask:
1762       if (size_in_bits == 256 && UseAVX < 2) {
1763         return false; // Implementation limitation
1764       }
1765       // fallthrough
1766     case Op_VectorStoreMask:
1767       if (vlen == 2) {
1768         return false; // Implementation limitation
1769       }
1770       break;
1771     case Op_VectorCastB2X:
1772       if (size_in_bits == 256 && UseAVX < 2) {
1773         return false; // Implementation limitation
1774       }
1775       break;
1776     case Op_VectorCastS2X:
1777       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1778         return false;
1779       }
1780       break;
1781     case Op_VectorCastI2X:
1782       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1783         return false;
1784       }
1785       break;
1786     case Op_VectorCastL2X:
1787       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1788         return false;
1789       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
1790         return false;
1791       }
1792       break;
1793     case Op_VectorCastF2X:
1794     case Op_VectorCastD2X:
1795       if (is_integral_type(bt)) {
1796         // Casts from FP to integral types require special fixup logic not easily
1797         // implementable with vectors.
1798         return false; // Implementation limitation
1799       }
1800     case Op_MulReductionVI:
1801       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1802         return false;
1803       }
1804       break;
1805     case Op_StoreVectorScatter:
      if (bt == T_BYTE || bt == T_SHORT) {
1807         return false;
1808       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1809         return false;
1810       }
1811       // fallthrough
1812     case Op_LoadVectorGather:
      if (size_in_bits == 64) {
1814         return false;
1815       }
1816       break;
1817     case Op_VectorMaskCmp:
1818       if (vlen < 2 || size_in_bits < 32) {
1819         return false;
1820       }
1821       break;
1822   }
  return true;  // Match rules are supported by default.
1824 }
1825 
1826 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
1827   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
1828   bool legacy = (generic_opnd->opcode() == LEGVEC);
1829   if (!VM_Version::supports_avx512vlbwdq() && // KNL
1830       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
1831     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
1832     return new legVecZOper();
1833   }
1834   if (legacy) {
1835     switch (ideal_reg) {
1836       case Op_VecS: return new legVecSOper();
1837       case Op_VecD: return new legVecDOper();
1838       case Op_VecX: return new legVecXOper();
1839       case Op_VecY: return new legVecYOper();
1840       case Op_VecZ: return new legVecZOper();
1841     }
1842   } else {
1843     switch (ideal_reg) {
1844       case Op_VecS: return new vecSOper();
1845       case Op_VecD: return new vecDOper();
1846       case Op_VecX: return new vecXOper();
1847       case Op_VecY: return new vecYOper();
1848       case Op_VecZ: return new vecZOper();
1849     }
1850   }
1851   ShouldNotReachHere();
1852   return NULL;
1853 }
1854 
1855 bool Matcher::is_reg2reg_move(MachNode* m) {
1856   switch (m->rule()) {
1857     case MoveVec2Leg_rule:
1858     case MoveLeg2Vec_rule:
1859     case MoveF2VL_rule:
1860     case MoveF2LEG_rule:
1861     case MoveVL2F_rule:
1862     case MoveLEG2F_rule:
1863     case MoveD2VL_rule:
1864     case MoveD2LEG_rule:
1865     case MoveVL2D_rule:
1866     case MoveLEG2D_rule:
1867       return true;
1868     default:
1869       return false;
1870   }
1871 }
1872 
1873 bool Matcher::is_generic_vector(MachOper* opnd) {
1874   switch (opnd->opcode()) {
1875     case VEC:
1876     case LEGVEC:
1877       return true;
1878     default:
1879       return false;
1880   }
1881 }
1882 
1883 //------------------------------------------------------------------------
1884 
1885 const RegMask* Matcher::predicate_reg_mask(void) {
1886   return &_VECTMASK_REG_mask;
1887 }
1888 
1889 const TypeVect* Matcher::predicate_reg_type(const Type* elemTy, int length) {
1890   return new TypeVectMask(TypeInt::BOOL, length);
1891 }
1892 
1893 // Max vector size in bytes. 0 if not supported.
1894 const int Matcher::vector_width_in_bytes(BasicType bt) {
1895   assert(is_java_primitive(bt), "only primitive type vectors");
1896   if (UseSSE < 2) return 0;
1897   // SSE2 supports 128bit vectors for all types.
1898   // AVX2 supports 256bit vectors for all types.
  // EVEX (AVX512) supports 512bit vectors for all types (subword types need AVX512BW).
1900   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1901   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1902   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1903     size = (UseAVX > 2) ? 64 : 32;
1904   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1905     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1906   // Use flag to limit vector size.
1907   size = MIN2(size,(int)MaxVectorSize);
1908   // Minimum 2 values in vector (or 4 for bytes).
1909   switch (bt) {
1910   case T_DOUBLE:
1911   case T_LONG:
1912     if (size < 16) return 0;
1913     break;
1914   case T_FLOAT:
1915   case T_INT:
1916     if (size < 8) return 0;
1917     break;
1918   case T_BOOLEAN:
1919     if (size < 4) return 0;
1920     break;
1921   case T_CHAR:
1922     if (size < 4) return 0;
1923     break;
1924   case T_BYTE:
1925     if (size < 4) return 0;
1926     break;
1927   case T_SHORT:
1928     if (size < 4) return 0;
1929     break;
1930   default:
1931     ShouldNotReachHere();
1932   }
1933   return size;
1934 }
1935 
1936 // Limits on vector size (number of elements) loaded into vector.
1937 const int Matcher::max_vector_size(const BasicType bt) {
1938   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1939 }
1940 const int Matcher::min_vector_size(const BasicType bt) {
1941   int max_size = max_vector_size(bt);
1942   // Min size which can be loaded into vector is 4 bytes.
1943   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1944   // Support for calling svml double64 vectors
1945   if (bt == T_DOUBLE) {
1946     size = 1;
1947   }
1948   return MIN2(size,max_size);
1949 }
1950 
1951 const int Matcher::scalable_vector_reg_size(const BasicType bt) {
1952   return -1;
1953 }
1954 
1955 // Vector ideal reg corresponding to specified size in bytes
1956 const uint Matcher::vector_ideal_reg(int size) {
1957   assert(MaxVectorSize >= size, "");
1958   switch(size) {
1959     case  4: return Op_VecS;
1960     case  8: return Op_VecD;
1961     case 16: return Op_VecX;
1962     case 32: return Op_VecY;
1963     case 64: return Op_VecZ;
1964   }
1965   ShouldNotReachHere();
1966   return 0;
1967 }
1968 
1969 // Check for shift by small constant as well
1970 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1971   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1972       shift->in(2)->get_int() <= 3 &&
1973       // Are there other uses besides address expressions?
1974       !matcher->is_visited(shift)) {
1975     address_visited.set(shift->_idx); // Flag as address_visited
1976     mstack.push(shift->in(2), Matcher::Visit);
1977     Node *conv = shift->in(1);
1978 #ifdef _LP64
1979     // Allow Matcher to match the rule which bypass
1980     // ConvI2L operation for an array index on LP64
1981     // if the index value is positive.
1982     if (conv->Opcode() == Op_ConvI2L &&
1983         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1984         // Are there other uses besides address expressions?
1985         !matcher->is_visited(conv)) {
1986       address_visited.set(conv->_idx); // Flag as address_visited
1987       mstack.push(conv->in(1), Matcher::Pre_Visit);
1988     } else
1989 #endif
1990       mstack.push(conv, Matcher::Pre_Visit);
1991     return true;
1992   }
1993   return false;
1994 }
1995 
// This function identifies sub-graphs in which a 'load' node is
// an input to two different nodes, such that the sub-graph can be matched
// with BMI instructions like blsi, blsr, etc.
// Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
2000 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
2001 // refers to the same node.
2002 //
2003 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
2004 // This is a temporary solution until we make DAGs expressible in ADL.
2005 template<typename ConType>
2006 class FusedPatternMatcher {
2007   Node* _op1_node;
2008   Node* _mop_node;
2009   int _con_op;
2010 
2011   static int match_next(Node* n, int next_op, int next_op_idx) {
2012     if (n->in(1) == NULL || n->in(2) == NULL) {
2013       return -1;
2014     }
2015 
2016     if (next_op_idx == -1) { // n is commutative, try rotations
2017       if (n->in(1)->Opcode() == next_op) {
2018         return 1;
2019       } else if (n->in(2)->Opcode() == next_op) {
2020         return 2;
2021       }
2022     } else {
2023       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
2024       if (n->in(next_op_idx)->Opcode() == next_op) {
2025         return next_op_idx;
2026       }
2027     }
2028     return -1;
2029   }
2030 
2031  public:
2032   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
2033     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
2034 
2035   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
2036              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
2037              typename ConType::NativeType con_value) {
2038     if (_op1_node->Opcode() != op1) {
2039       return false;
2040     }
2041     if (_mop_node->outcnt() > 2) {
2042       return false;
2043     }
2044     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
2045     if (op1_op2_idx == -1) {
2046       return false;
2047     }
2048     // Memory operation must be the other edge
2049     int op1_mop_idx = (op1_op2_idx & 1) + 1;
2050 
2051     // Check that the mop node is really what we want
2052     if (_op1_node->in(op1_mop_idx) == _mop_node) {
2053       Node* op2_node = _op1_node->in(op1_op2_idx);
2054       if (op2_node->outcnt() > 1) {
2055         return false;
2056       }
2057       assert(op2_node->Opcode() == op2, "Should be");
2058       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
2059       if (op2_con_idx == -1) {
2060         return false;
2061       }
2062       // Memory operation must be the other edge
2063       int op2_mop_idx = (op2_con_idx & 1) + 1;
2064       // Check that the memory operation is the same node
2065       if (op2_node->in(op2_mop_idx) == _mop_node) {
2066         // Now check the constant
2067         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
2068         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
2069           return true;
2070         }
2071       }
2072     }
2073     return false;
2074   }
2075 };
2076 
2077 static bool is_bmi_pattern(Node* n, Node* m) {
2078   assert(UseBMI1Instructions, "sanity");
2079   if (n != NULL && m != NULL) {
2080     if (m->Opcode() == Op_LoadI) {
2081       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
2082       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
2083              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
2084              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
2085     } else if (m->Opcode() == Op_LoadL) {
2086       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
2087       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
2088              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
2089              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
2090     }
2091   }
2092   return false;
2093 }
2094 
2095 // Should the matcher clone input 'm' of node 'n'?
2096 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
2097   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
2098   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
2099     mstack.push(m, Visit);
2100     return true;
2101   }
2102   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
2103     mstack.push(m, Visit);           // m = ShiftCntV
2104     return true;
2105   }
2106   return false;
2107 }
2108 
2109 // Should the Matcher clone shifts on addressing modes, expecting them
2110 // to be subsumed into complex addressing expressions or compute them
2111 // into registers?
2112 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
2113   Node *off = m->in(AddPNode::Offset);
2114   if (off->is_Con()) {
2115     address_visited.test_set(m->_idx); // Flag as address_visited
2116     Node *adr = m->in(AddPNode::Address);
2117 
2118     // Intel can handle 2 adds in addressing mode
2119     // AtomicAdd is not an addressing expression.
2120     // Cheap to find it by looking for screwy base.
2121     if (adr->is_AddP() &&
2122         !adr->in(AddPNode::Base)->is_top() &&
2123         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
2124         // Are there other uses besides address expressions?
2125         !is_visited(adr)) {
2126       address_visited.set(adr->_idx); // Flag as address_visited
2127       Node *shift = adr->in(AddPNode::Offset);
2128       if (!clone_shift(shift, this, mstack, address_visited)) {
2129         mstack.push(shift, Pre_Visit);
2130       }
2131       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
2132       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
2133     } else {
2134       mstack.push(adr, Pre_Visit);
2135     }
2136 
2137     // Clone X+offset as it also folds into most addressing expressions
2138     mstack.push(off, Visit);
2139     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2140     return true;
2141   } else if (clone_shift(off, this, mstack, address_visited)) {
2142     address_visited.test_set(m->_idx); // Flag as address_visited
2143     mstack.push(m->in(AddPNode::Address), Pre_Visit);
2144     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2145     return true;
2146   }
2147   return false;
2148 }
2149 
2150 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
2151   switch (bt) {
2152     case BoolTest::eq:
2153       return Assembler::eq;
2154     case BoolTest::ne:
2155       return Assembler::neq;
2156     case BoolTest::le:
2157     case BoolTest::ule:
2158       return Assembler::le;
2159     case BoolTest::ge:
2160     case BoolTest::uge:
2161       return Assembler::nlt;
2162     case BoolTest::lt:
2163     case BoolTest::ult:
2164       return Assembler::lt;
2165     case BoolTest::gt:
2166     case BoolTest::ugt:
2167       return Assembler::nle;
2168     default : ShouldNotReachHere(); return Assembler::_false;
2169   }
2170 }
2171 
2172 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
2173   switch (bt) {
2174   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
2175   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
2176   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
2177   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
2178   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
2179   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
2180   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
2181   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
2182   }
2183 }
2184 
2185 // Helper methods for MachSpillCopyNode::implementation().
2186 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
2187                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
2188   assert(ireg == Op_VecS || // 32bit vector
2189          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
2190          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
2191          "no non-adjacent vector moves" );
2192   if (cbuf) {
2193     C2_MacroAssembler _masm(cbuf);
2194     switch (ireg) {
2195     case Op_VecS: // copy whole register
2196     case Op_VecD:
2197     case Op_VecX:
2198 #ifndef _LP64
2199       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2200 #else
2201       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2202         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2203       } else {
2204         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2205      }
2206 #endif
2207       break;
2208     case Op_VecY:
2209 #ifndef _LP64
2210       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2211 #else
2212       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2213         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2214       } else {
2215         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2216      }
2217 #endif
2218       break;
2219     case Op_VecZ:
2220       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
2221       break;
2222     default:
2223       ShouldNotReachHere();
2224     }
2225 #ifndef PRODUCT
2226   } else {
2227     switch (ireg) {
2228     case Op_VecS:
2229     case Op_VecD:
2230     case Op_VecX:
2231       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2232       break;
2233     case Op_VecY:
2234     case Op_VecZ:
2235       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2236       break;
2237     default:
2238       ShouldNotReachHere();
2239     }
2240 #endif
2241   }
2242 }
2243 
2244 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
2245                      int stack_offset, int reg, uint ireg, outputStream* st) {
2246   if (cbuf) {
2247     C2_MacroAssembler _masm(cbuf);
2248     if (is_load) {
2249       switch (ireg) {
2250       case Op_VecS:
2251         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2252         break;
2253       case Op_VecD:
2254         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2255         break;
2256       case Op_VecX:
2257 #ifndef _LP64
2258         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2259 #else
2260         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2261           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2262         } else {
2263           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2264           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2265         }
2266 #endif
2267         break;
2268       case Op_VecY:
2269 #ifndef _LP64
2270         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2271 #else
2272         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2273           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2274         } else {
2275           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2276           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2277         }
2278 #endif
2279         break;
2280       case Op_VecZ:
2281         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
2282         break;
2283       default:
2284         ShouldNotReachHere();
2285       }
2286     } else { // store
2287       switch (ireg) {
2288       case Op_VecS:
2289         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2290         break;
2291       case Op_VecD:
2292         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2293         break;
2294       case Op_VecX:
2295 #ifndef _LP64
2296         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2297 #else
2298         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2299           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2300         }
2301         else {
2302           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2303         }
2304 #endif
2305         break;
2306       case Op_VecY:
2307 #ifndef _LP64
2308         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2309 #else
2310         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2311           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2312         }
2313         else {
2314           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2315         }
2316 #endif
2317         break;
2318       case Op_VecZ:
2319         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2320         break;
2321       default:
2322         ShouldNotReachHere();
2323       }
2324     }
2325 #ifndef PRODUCT
2326   } else {
2327     if (is_load) {
2328       switch (ireg) {
2329       case Op_VecS:
2330         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2331         break;
2332       case Op_VecD:
2333         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2334         break;
2335       case Op_VecX:
2336         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2337         break;
2338       case Op_VecY:
2339       case Op_VecZ:
2340         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2341         break;
2342       default:
2343         ShouldNotReachHere();
2344       }
2345     } else { // store
2346       switch (ireg) {
2347       case Op_VecS:
2348         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2349         break;
2350       case Op_VecD:
2351         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2352         break;
2353       case Op_VecX:
2354         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2355         break;
2356       case Op_VecY:
2357       case Op_VecZ:
2358         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2359         break;
2360       default:
2361         ShouldNotReachHere();
2362       }
2363     }
2364 #endif
2365   }
2366 }
2367 
2368 static inline jlong replicate8_imm(int con, int width) {
2369   // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
2370   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
2371   int bit_width = width * 8;
2372   jlong val = con;
2373   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
2374   while (bit_width < 64) {
2375     val |= (val << bit_width);
2376     bit_width <<= 1;
2377   }
2378   return val;
2379 }
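// Worked examples of the helper above:
//   replicate8_imm(0xA1, 1) == 0xA1A1A1A1A1A1A1A1
//   replicate8_imm(-1,   2) == 0xFFFFFFFFFFFFFFFF  (the sign bits are masked off first)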
2380 
2381 #ifndef PRODUCT
2382   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2383     st->print("nop \t# %d bytes pad for loops and calls", _count);
2384   }
2385 #endif
2386 
2387   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
2388     C2_MacroAssembler _masm(&cbuf);
2389     __ nop(_count);
2390   }
2391 
2392   uint MachNopNode::size(PhaseRegAlloc*) const {
2393     return _count;
2394   }
2395 
2396 #ifndef PRODUCT
2397   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2398     st->print("# breakpoint");
2399   }
2400 #endif
2401 
2402   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2403     C2_MacroAssembler _masm(&cbuf);
2404     __ int3();
2405   }
2406 
2407   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2408     return MachNode::size(ra_);
2409   }
2410 
2411 %}
2412 
2413 encode %{
2414 
2415   enc_class call_epilog %{
2416     if (VerifyStackAtCalls) {
2417       // Check that stack depth is unchanged: find majik cookie on stack
2418       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2419       C2_MacroAssembler _masm(&cbuf);
2420       Label L;
2421       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2422       __ jccb(Assembler::equal, L);
2423       // Die if stack mismatch
2424       __ int3();
2425       __ bind(L);
2426     }
2427     if (tf()->returns_inline_type_as_fields() && !_method->is_method_handle_intrinsic()) {
2428       // An inline type is returned as fields in multiple registers.
2429       // rax either contains an oop (if the inline type is buffered) or a pointer
2430       // to the corresponding InlineKlass with its lowest bit set to 1. Zero rax
2431       // if the lowest bit is set, so that C2 can use the oop after null checking.
2432       // rax &= (rax & 1) - 1
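      // In other words: if the low bit is 1 (tagged InlineKlass pointer) then
      // (rax & 1) - 1 == 0 and the final and clears rax; if the low bit is 0
      // (ordinary oop) then (rax & 1) - 1 == -1 (all ones) and rax is unchanged.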
2433       C2_MacroAssembler _masm(&cbuf);
2434       __ movptr(rscratch1, rax);
2435       __ andptr(rscratch1, 0x1);
2436       __ subptr(rscratch1, 0x1);
2437       __ andptr(rax, rscratch1);
2438     }
2439   %}
2440 
2441 %}
2442 
2443 // Operands for bound floating point register arguments
2444 operand rxmm0() %{
2445   constraint(ALLOC_IN_RC(xmm0_reg));
2446   match(VecX);
2447   format %{ %}
2448   interface(REG_INTER);
2449 %}
2450 
2451 //----------OPERANDS-----------------------------------------------------------
2452 // Operand definitions must precede instruction definitions for correct parsing
2453 // in the ADLC because operands constitute user defined types which are used in
2454 // instruction definitions.
2455 
2456 // Vectors
2457 
2458 // Dummy generic vector class. Should be used for all vector operands.
2459 // Replaced with vec[SDXYZ] during post-selection pass.
2460 operand vec() %{
2461   constraint(ALLOC_IN_RC(dynamic));
2462   match(VecX);
2463   match(VecY);
2464   match(VecZ);
2465   match(VecS);
2466   match(VecD);
2467 
2468   format %{ %}
2469   interface(REG_INTER);
2470 %}
2471 
2472 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
2473 // Replaced with legVec[SDXYZ] during post-selection cleanup.
2474 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
2475 // runtime code generation via reg_class_dynamic.
2476 operand legVec() %{
2477   constraint(ALLOC_IN_RC(dynamic));
2478   match(VecX);
2479   match(VecY);
2480   match(VecZ);
2481   match(VecS);
2482   match(VecD);
2483 
2484   format %{ %}
2485   interface(REG_INTER);
2486 %}
2487 
2488 // Replaces vec during post-selection cleanup. See above.
2489 operand vecS() %{
2490   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
2491   match(VecS);
2492 
2493   format %{ %}
2494   interface(REG_INTER);
2495 %}
2496 
2497 // Replaces legVec during post-selection cleanup. See above.
2498 operand legVecS() %{
2499   constraint(ALLOC_IN_RC(vectors_reg_legacy));
2500   match(VecS);
2501 
2502   format %{ %}
2503   interface(REG_INTER);
2504 %}
2505 
2506 // Replaces vec during post-selection cleanup. See above.
2507 operand vecD() %{
2508   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
2509   match(VecD);
2510 
2511   format %{ %}
2512   interface(REG_INTER);
2513 %}
2514 
2515 // Replaces legVec during post-selection cleanup. See above.
2516 operand legVecD() %{
2517   constraint(ALLOC_IN_RC(vectord_reg_legacy));
2518   match(VecD);
2519 
2520   format %{ %}
2521   interface(REG_INTER);
2522 %}
2523 
2524 // Replaces vec during post-selection cleanup. See above.
2525 operand vecX() %{
2526   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
2527   match(VecX);
2528 
2529   format %{ %}
2530   interface(REG_INTER);
2531 %}
2532 
2533 // Replaces legVec during post-selection cleanup. See above.
2534 operand legVecX() %{
2535   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
2536   match(VecX);
2537 
2538   format %{ %}
2539   interface(REG_INTER);
2540 %}
2541 
2542 // Replaces vec during post-selection cleanup. See above.
2543 operand vecY() %{
2544   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
2545   match(VecY);
2546 
2547   format %{ %}
2548   interface(REG_INTER);
2549 %}
2550 
2551 // Replaces legVec during post-selection cleanup. See above.
2552 operand legVecY() %{
2553   constraint(ALLOC_IN_RC(vectory_reg_legacy));
2554   match(VecY);
2555 
2556   format %{ %}
2557   interface(REG_INTER);
2558 %}
2559 
2560 // Replaces vec during post-selection cleanup. See above.
2561 operand vecZ() %{
2562   constraint(ALLOC_IN_RC(vectorz_reg));
2563   match(VecZ);
2564 
2565   format %{ %}
2566   interface(REG_INTER);
2567 %}
2568 
2569 // Replaces legVec during post-selection cleanup. See above.
2570 operand legVecZ() %{
2571   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
2572   match(VecZ);
2573 
2574   format %{ %}
2575   interface(REG_INTER);
2576 %}
2577 
2578 // Comparison Code for FP conditional move
2579 operand cmpOp_vcmppd() %{
2580   match(Bool);
2581 
2582   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2583             n->as_Bool()->_test._test != BoolTest::no_overflow);
2584   format %{ "" %}
2585   interface(COND_INTER) %{
2586     equal        (0x0, "eq");
2587     less         (0x1, "lt");
2588     less_equal   (0x2, "le");
2589     not_equal    (0xC, "ne");
2590     greater_equal(0xD, "ge");
2591     greater      (0xE, "gt");
2592     // TODO: cannot compile (adlc breaks) without the next two lines; error:
2593     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2594     // equal' for overflow.
2595     overflow     (0x20, "o");  // not really supported by the instruction
2596     no_overflow  (0x21, "no"); // not really supported by the instruction
2597   %}
2598 %}
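// The encodings above are the 8-bit immediate comparison predicates expected by
// vcmppd (e.g. 0x0 selects EQ_OQ and 0xC selects NEQ_OQ); overflow/no_overflow
// are dummy values that exist only to keep the ADLC parser happy, per the TODO.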
2599 
2600 
2601 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2602 
2603 // ============================================================================
2604 
2605 instruct ShouldNotReachHere() %{
2606   match(Halt);
2607   format %{ "stop\t# ShouldNotReachHere" %}
2608   ins_encode %{
2609     if (is_reachable()) {
2610       __ stop(_halt_reason);
2611     }
2612   %}
2613   ins_pipe(pipe_slow);
2614 %}
2615 
2616 // =================================EVEX special===============================
2617 // The existing partial implementation of post-loop multi-versioning computes the
2618 // mask corresponding to the tail loop in the K1 opmask register. This mask may then
2619 // be used to predicate instructions in the loop body during the last post-loop iteration.
2620 // TODO: Remove hard-coded K1 usage while fixing existing post-loop
2621 // multiversioning support.
2622 instruct setMask(rRegI dst, rRegI src, kReg_K1 mask) %{
2623   predicate(PostLoopMultiversioning && Matcher::has_predicated_vectors());
2624   match(Set dst (SetVectMaskI  src));
2625   effect(TEMP dst);
2626   format %{ "setvectmask   $dst, $src" %}
2627   ins_encode %{
2628     __ setvectmask($dst$$Register, $src$$Register, $mask$$KRegister);
2629   %}
2630   ins_pipe(pipe_slow);
2631 %}
2632 
2633 // ============================================================================
2634 
2635 instruct addF_reg(regF dst, regF src) %{
2636   predicate((UseSSE>=1) && (UseAVX == 0));
2637   match(Set dst (AddF dst src));
2638 
2639   format %{ "addss   $dst, $src" %}
2640   ins_cost(150);
2641   ins_encode %{
2642     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2643   %}
2644   ins_pipe(pipe_slow);
2645 %}
2646 
2647 instruct addF_mem(regF dst, memory src) %{
2648   predicate((UseSSE>=1) && (UseAVX == 0));
2649   match(Set dst (AddF dst (LoadF src)));
2650 
2651   format %{ "addss   $dst, $src" %}
2652   ins_cost(150);
2653   ins_encode %{
2654     __ addss($dst$$XMMRegister, $src$$Address);
2655   %}
2656   ins_pipe(pipe_slow);
2657 %}
2658 
2659 instruct addF_imm(regF dst, immF con) %{
2660   predicate((UseSSE>=1) && (UseAVX == 0));
2661   match(Set dst (AddF dst con));
2662   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2663   ins_cost(150);
2664   ins_encode %{
2665     __ addss($dst$$XMMRegister, $constantaddress($con));
2666   %}
2667   ins_pipe(pipe_slow);
2668 %}
2669 
2670 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2671   predicate(UseAVX > 0);
2672   match(Set dst (AddF src1 src2));
2673 
2674   format %{ "vaddss  $dst, $src1, $src2" %}
2675   ins_cost(150);
2676   ins_encode %{
2677     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2678   %}
2679   ins_pipe(pipe_slow);
2680 %}
2681 
2682 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2683   predicate(UseAVX > 0);
2684   match(Set dst (AddF src1 (LoadF src2)));
2685 
2686   format %{ "vaddss  $dst, $src1, $src2" %}
2687   ins_cost(150);
2688   ins_encode %{
2689     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2690   %}
2691   ins_pipe(pipe_slow);
2692 %}
2693 
2694 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2695   predicate(UseAVX > 0);
2696   match(Set dst (AddF src con));
2697 
2698   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2699   ins_cost(150);
2700   ins_encode %{
2701     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2702   %}
2703   ins_pipe(pipe_slow);
2704 %}
2705 
2706 instruct addD_reg(regD dst, regD src) %{
2707   predicate((UseSSE>=2) && (UseAVX == 0));
2708   match(Set dst (AddD dst src));
2709 
2710   format %{ "addsd   $dst, $src" %}
2711   ins_cost(150);
2712   ins_encode %{
2713     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2714   %}
2715   ins_pipe(pipe_slow);
2716 %}
2717 
2718 instruct addD_mem(regD dst, memory src) %{
2719   predicate((UseSSE>=2) && (UseAVX == 0));
2720   match(Set dst (AddD dst (LoadD src)));
2721 
2722   format %{ "addsd   $dst, $src" %}
2723   ins_cost(150);
2724   ins_encode %{
2725     __ addsd($dst$$XMMRegister, $src$$Address);
2726   %}
2727   ins_pipe(pipe_slow);
2728 %}
2729 
2730 instruct addD_imm(regD dst, immD con) %{
2731   predicate((UseSSE>=2) && (UseAVX == 0));
2732   match(Set dst (AddD dst con));
2733   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2734   ins_cost(150);
2735   ins_encode %{
2736     __ addsd($dst$$XMMRegister, $constantaddress($con));
2737   %}
2738   ins_pipe(pipe_slow);
2739 %}
2740 
2741 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2742   predicate(UseAVX > 0);
2743   match(Set dst (AddD src1 src2));
2744 
2745   format %{ "vaddsd  $dst, $src1, $src2" %}
2746   ins_cost(150);
2747   ins_encode %{
2748     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2749   %}
2750   ins_pipe(pipe_slow);
2751 %}
2752 
2753 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2754   predicate(UseAVX > 0);
2755   match(Set dst (AddD src1 (LoadD src2)));
2756 
2757   format %{ "vaddsd  $dst, $src1, $src2" %}
2758   ins_cost(150);
2759   ins_encode %{
2760     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2761   %}
2762   ins_pipe(pipe_slow);
2763 %}
2764 
2765 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2766   predicate(UseAVX > 0);
2767   match(Set dst (AddD src con));
2768 
2769   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2770   ins_cost(150);
2771   ins_encode %{
2772     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2773   %}
2774   ins_pipe(pipe_slow);
2775 %}
2776 
2777 instruct subF_reg(regF dst, regF src) %{
2778   predicate((UseSSE>=1) && (UseAVX == 0));
2779   match(Set dst (SubF dst src));
2780 
2781   format %{ "subss   $dst, $src" %}
2782   ins_cost(150);
2783   ins_encode %{
2784     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2785   %}
2786   ins_pipe(pipe_slow);
2787 %}
2788 
2789 instruct subF_mem(regF dst, memory src) %{
2790   predicate((UseSSE>=1) && (UseAVX == 0));
2791   match(Set dst (SubF dst (LoadF src)));
2792 
2793   format %{ "subss   $dst, $src" %}
2794   ins_cost(150);
2795   ins_encode %{
2796     __ subss($dst$$XMMRegister, $src$$Address);
2797   %}
2798   ins_pipe(pipe_slow);
2799 %}
2800 
2801 instruct subF_imm(regF dst, immF con) %{
2802   predicate((UseSSE>=1) && (UseAVX == 0));
2803   match(Set dst (SubF dst con));
2804   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2805   ins_cost(150);
2806   ins_encode %{
2807     __ subss($dst$$XMMRegister, $constantaddress($con));
2808   %}
2809   ins_pipe(pipe_slow);
2810 %}
2811 
2812 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2813   predicate(UseAVX > 0);
2814   match(Set dst (SubF src1 src2));
2815 
2816   format %{ "vsubss  $dst, $src1, $src2" %}
2817   ins_cost(150);
2818   ins_encode %{
2819     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2820   %}
2821   ins_pipe(pipe_slow);
2822 %}
2823 
2824 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2825   predicate(UseAVX > 0);
2826   match(Set dst (SubF src1 (LoadF src2)));
2827 
2828   format %{ "vsubss  $dst, $src1, $src2" %}
2829   ins_cost(150);
2830   ins_encode %{
2831     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2832   %}
2833   ins_pipe(pipe_slow);
2834 %}
2835 
2836 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2837   predicate(UseAVX > 0);
2838   match(Set dst (SubF src con));
2839 
2840   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2841   ins_cost(150);
2842   ins_encode %{
2843     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2844   %}
2845   ins_pipe(pipe_slow);
2846 %}
2847 
2848 instruct subD_reg(regD dst, regD src) %{
2849   predicate((UseSSE>=2) && (UseAVX == 0));
2850   match(Set dst (SubD dst src));
2851 
2852   format %{ "subsd   $dst, $src" %}
2853   ins_cost(150);
2854   ins_encode %{
2855     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2856   %}
2857   ins_pipe(pipe_slow);
2858 %}
2859 
2860 instruct subD_mem(regD dst, memory src) %{
2861   predicate((UseSSE>=2) && (UseAVX == 0));
2862   match(Set dst (SubD dst (LoadD src)));
2863 
2864   format %{ "subsd   $dst, $src" %}
2865   ins_cost(150);
2866   ins_encode %{
2867     __ subsd($dst$$XMMRegister, $src$$Address);
2868   %}
2869   ins_pipe(pipe_slow);
2870 %}
2871 
2872 instruct subD_imm(regD dst, immD con) %{
2873   predicate((UseSSE>=2) && (UseAVX == 0));
2874   match(Set dst (SubD dst con));
2875   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2876   ins_cost(150);
2877   ins_encode %{
2878     __ subsd($dst$$XMMRegister, $constantaddress($con));
2879   %}
2880   ins_pipe(pipe_slow);
2881 %}
2882 
2883 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2884   predicate(UseAVX > 0);
2885   match(Set dst (SubD src1 src2));
2886 
2887   format %{ "vsubsd  $dst, $src1, $src2" %}
2888   ins_cost(150);
2889   ins_encode %{
2890     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2891   %}
2892   ins_pipe(pipe_slow);
2893 %}
2894 
2895 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2896   predicate(UseAVX > 0);
2897   match(Set dst (SubD src1 (LoadD src2)));
2898 
2899   format %{ "vsubsd  $dst, $src1, $src2" %}
2900   ins_cost(150);
2901   ins_encode %{
2902     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2903   %}
2904   ins_pipe(pipe_slow);
2905 %}
2906 
2907 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2908   predicate(UseAVX > 0);
2909   match(Set dst (SubD src con));
2910 
2911   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2912   ins_cost(150);
2913   ins_encode %{
2914     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2915   %}
2916   ins_pipe(pipe_slow);
2917 %}
2918 
2919 instruct mulF_reg(regF dst, regF src) %{
2920   predicate((UseSSE>=1) && (UseAVX == 0));
2921   match(Set dst (MulF dst src));
2922 
2923   format %{ "mulss   $dst, $src" %}
2924   ins_cost(150);
2925   ins_encode %{
2926     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2927   %}
2928   ins_pipe(pipe_slow);
2929 %}
2930 
2931 instruct mulF_mem(regF dst, memory src) %{
2932   predicate((UseSSE>=1) && (UseAVX == 0));
2933   match(Set dst (MulF dst (LoadF src)));
2934 
2935   format %{ "mulss   $dst, $src" %}
2936   ins_cost(150);
2937   ins_encode %{
2938     __ mulss($dst$$XMMRegister, $src$$Address);
2939   %}
2940   ins_pipe(pipe_slow);
2941 %}
2942 
2943 instruct mulF_imm(regF dst, immF con) %{
2944   predicate((UseSSE>=1) && (UseAVX == 0));
2945   match(Set dst (MulF dst con));
2946   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2947   ins_cost(150);
2948   ins_encode %{
2949     __ mulss($dst$$XMMRegister, $constantaddress($con));
2950   %}
2951   ins_pipe(pipe_slow);
2952 %}
2953 
2954 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2955   predicate(UseAVX > 0);
2956   match(Set dst (MulF src1 src2));
2957 
2958   format %{ "vmulss  $dst, $src1, $src2" %}
2959   ins_cost(150);
2960   ins_encode %{
2961     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2962   %}
2963   ins_pipe(pipe_slow);
2964 %}
2965 
2966 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2967   predicate(UseAVX > 0);
2968   match(Set dst (MulF src1 (LoadF src2)));
2969 
2970   format %{ "vmulss  $dst, $src1, $src2" %}
2971   ins_cost(150);
2972   ins_encode %{
2973     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2974   %}
2975   ins_pipe(pipe_slow);
2976 %}
2977 
2978 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2979   predicate(UseAVX > 0);
2980   match(Set dst (MulF src con));
2981 
2982   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2983   ins_cost(150);
2984   ins_encode %{
2985     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2986   %}
2987   ins_pipe(pipe_slow);
2988 %}
2989 
2990 instruct mulD_reg(regD dst, regD src) %{
2991   predicate((UseSSE>=2) && (UseAVX == 0));
2992   match(Set dst (MulD dst src));
2993 
2994   format %{ "mulsd   $dst, $src" %}
2995   ins_cost(150);
2996   ins_encode %{
2997     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2998   %}
2999   ins_pipe(pipe_slow);
3000 %}
3001 
3002 instruct mulD_mem(regD dst, memory src) %{
3003   predicate((UseSSE>=2) && (UseAVX == 0));
3004   match(Set dst (MulD dst (LoadD src)));
3005 
3006   format %{ "mulsd   $dst, $src" %}
3007   ins_cost(150);
3008   ins_encode %{
3009     __ mulsd($dst$$XMMRegister, $src$$Address);
3010   %}
3011   ins_pipe(pipe_slow);
3012 %}
3013 
3014 instruct mulD_imm(regD dst, immD con) %{
3015   predicate((UseSSE>=2) && (UseAVX == 0));
3016   match(Set dst (MulD dst con));
3017   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3018   ins_cost(150);
3019   ins_encode %{
3020     __ mulsd($dst$$XMMRegister, $constantaddress($con));
3021   %}
3022   ins_pipe(pipe_slow);
3023 %}
3024 
3025 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
3026   predicate(UseAVX > 0);
3027   match(Set dst (MulD src1 src2));
3028 
3029   format %{ "vmulsd  $dst, $src1, $src2" %}
3030   ins_cost(150);
3031   ins_encode %{
3032     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3033   %}
3034   ins_pipe(pipe_slow);
3035 %}
3036 
3037 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
3038   predicate(UseAVX > 0);
3039   match(Set dst (MulD src1 (LoadD src2)));
3040 
3041   format %{ "vmulsd  $dst, $src1, $src2" %}
3042   ins_cost(150);
3043   ins_encode %{
3044     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3045   %}
3046   ins_pipe(pipe_slow);
3047 %}
3048 
3049 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
3050   predicate(UseAVX > 0);
3051   match(Set dst (MulD src con));
3052 
3053   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3054   ins_cost(150);
3055   ins_encode %{
3056     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3057   %}
3058   ins_pipe(pipe_slow);
3059 %}
3060 
3061 instruct divF_reg(regF dst, regF src) %{
3062   predicate((UseSSE>=1) && (UseAVX == 0));
3063   match(Set dst (DivF dst src));
3064 
3065   format %{ "divss   $dst, $src" %}
3066   ins_cost(150);
3067   ins_encode %{
3068     __ divss($dst$$XMMRegister, $src$$XMMRegister);
3069   %}
3070   ins_pipe(pipe_slow);
3071 %}
3072 
3073 instruct divF_mem(regF dst, memory src) %{
3074   predicate((UseSSE>=1) && (UseAVX == 0));
3075   match(Set dst (DivF dst (LoadF src)));
3076 
3077   format %{ "divss   $dst, $src" %}
3078   ins_cost(150);
3079   ins_encode %{
3080     __ divss($dst$$XMMRegister, $src$$Address);
3081   %}
3082   ins_pipe(pipe_slow);
3083 %}
3084 
3085 instruct divF_imm(regF dst, immF con) %{
3086   predicate((UseSSE>=1) && (UseAVX == 0));
3087   match(Set dst (DivF dst con));
3088   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3089   ins_cost(150);
3090   ins_encode %{
3091     __ divss($dst$$XMMRegister, $constantaddress($con));
3092   %}
3093   ins_pipe(pipe_slow);
3094 %}
3095 
3096 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
3097   predicate(UseAVX > 0);
3098   match(Set dst (DivF src1 src2));
3099 
3100   format %{ "vdivss  $dst, $src1, $src2" %}
3101   ins_cost(150);
3102   ins_encode %{
3103     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3104   %}
3105   ins_pipe(pipe_slow);
3106 %}
3107 
3108 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
3109   predicate(UseAVX > 0);
3110   match(Set dst (DivF src1 (LoadF src2)));
3111 
3112   format %{ "vdivss  $dst, $src1, $src2" %}
3113   ins_cost(150);
3114   ins_encode %{
3115     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3116   %}
3117   ins_pipe(pipe_slow);
3118 %}
3119 
3120 instruct divF_reg_imm(regF dst, regF src, immF con) %{
3121   predicate(UseAVX > 0);
3122   match(Set dst (DivF src con));
3123 
3124   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3125   ins_cost(150);
3126   ins_encode %{
3127     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3128   %}
3129   ins_pipe(pipe_slow);
3130 %}
3131 
3132 instruct divD_reg(regD dst, regD src) %{
3133   predicate((UseSSE>=2) && (UseAVX == 0));
3134   match(Set dst (DivD dst src));
3135 
3136   format %{ "divsd   $dst, $src" %}
3137   ins_cost(150);
3138   ins_encode %{
3139     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
3140   %}
3141   ins_pipe(pipe_slow);
3142 %}
3143 
3144 instruct divD_mem(regD dst, memory src) %{
3145   predicate((UseSSE>=2) && (UseAVX == 0));
3146   match(Set dst (DivD dst (LoadD src)));
3147 
3148   format %{ "divsd   $dst, $src" %}
3149   ins_cost(150);
3150   ins_encode %{
3151     __ divsd($dst$$XMMRegister, $src$$Address);
3152   %}
3153   ins_pipe(pipe_slow);
3154 %}
3155 
3156 instruct divD_imm(regD dst, immD con) %{
3157   predicate((UseSSE>=2) && (UseAVX == 0));
3158   match(Set dst (DivD dst con));
3159   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3160   ins_cost(150);
3161   ins_encode %{
3162     __ divsd($dst$$XMMRegister, $constantaddress($con));
3163   %}
3164   ins_pipe(pipe_slow);
3165 %}
3166 
3167 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
3168   predicate(UseAVX > 0);
3169   match(Set dst (DivD src1 src2));
3170 
3171   format %{ "vdivsd  $dst, $src1, $src2" %}
3172   ins_cost(150);
3173   ins_encode %{
3174     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3175   %}
3176   ins_pipe(pipe_slow);
3177 %}
3178 
3179 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
3180   predicate(UseAVX > 0);
3181   match(Set dst (DivD src1 (LoadD src2)));
3182 
3183   format %{ "vdivsd  $dst, $src1, $src2" %}
3184   ins_cost(150);
3185   ins_encode %{
3186     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3187   %}
3188   ins_pipe(pipe_slow);
3189 %}
3190 
3191 instruct divD_reg_imm(regD dst, regD src, immD con) %{
3192   predicate(UseAVX > 0);
3193   match(Set dst (DivD src con));
3194 
3195   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3196   ins_cost(150);
3197   ins_encode %{
3198     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3199   %}
3200   ins_pipe(pipe_slow);
3201 %}
3202 
3203 instruct absF_reg(regF dst) %{
3204   predicate((UseSSE>=1) && (UseAVX == 0));
3205   match(Set dst (AbsF dst));
3206   ins_cost(150);
3207   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
3208   ins_encode %{
3209     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
3210   %}
3211   ins_pipe(pipe_slow);
3212 %}
3213 
3214 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
3215   predicate(UseAVX > 0);
3216   match(Set dst (AbsF src));
3217   ins_cost(150);
3218   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
3219   ins_encode %{
3220     int vlen_enc = Assembler::AVX_128bit;
3221     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
3222               ExternalAddress(float_signmask()), vlen_enc);
3223   %}
3224   ins_pipe(pipe_slow);
3225 %}
3226 
3227 instruct absD_reg(regD dst) %{
3228   predicate((UseSSE>=2) && (UseAVX == 0));
3229   match(Set dst (AbsD dst));
3230   ins_cost(150);
3231   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
3232             "# abs double by sign masking" %}
3233   ins_encode %{
3234     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
3235   %}
3236   ins_pipe(pipe_slow);
3237 %}
3238 
3239 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
3240   predicate(UseAVX > 0);
3241   match(Set dst (AbsD src));
3242   ins_cost(150);
3243   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
3244             "# abs double by sign masking" %}
3245   ins_encode %{
3246     int vlen_enc = Assembler::AVX_128bit;
3247     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
3248               ExternalAddress(double_signmask()), vlen_enc);
3249   %}
3250   ins_pipe(pipe_slow);
3251 %}
3252 
3253 instruct negF_reg(regF dst) %{
3254   predicate((UseSSE>=1) && (UseAVX == 0));
3255   match(Set dst (NegF dst));
3256   ins_cost(150);
3257   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
3258   ins_encode %{
3259     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
3260   %}
3261   ins_pipe(pipe_slow);
3262 %}
3263 
3264 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
3265   predicate(UseAVX > 0);
3266   match(Set dst (NegF src));
3267   ins_cost(150);
3268   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
3269   ins_encode %{
3270     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
3271                  ExternalAddress(float_signflip()));
3272   %}
3273   ins_pipe(pipe_slow);
3274 %}
3275 
3276 instruct negD_reg(regD dst) %{
3277   predicate((UseSSE>=2) && (UseAVX == 0));
3278   match(Set dst (NegD dst));
3279   ins_cost(150);
3280   format %{ "xorpd   $dst, [0x8000000000000000]\t"
3281             "# neg double by sign flipping" %}
3282   ins_encode %{
3283     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
3284   %}
3285   ins_pipe(pipe_slow);
3286 %}
3287 
3288 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
3289   predicate(UseAVX > 0);
3290   match(Set dst (NegD src));
3291   ins_cost(150);
3292   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
3293             "# neg double by sign flipping" %}
3294   ins_encode %{
3295     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
3296                  ExternalAddress(double_signflip()));
3297   %}
3298   ins_pipe(pipe_slow);
3299 %}
3300 
3301 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
3302 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
3303 instruct sqrtF_reg(regF dst) %{
3304   predicate(UseSSE>=1);
3305   match(Set dst (SqrtF dst));
3306   format %{ "sqrtss  $dst, $dst" %}
3307   ins_encode %{
3308     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
3309   %}
3310   ins_pipe(pipe_slow);
3311 %}
3312 
3313 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
3314 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
3315 instruct sqrtD_reg(regD dst) %{
3316   predicate(UseSSE>=2);
3317   match(Set dst (SqrtD dst));
3318   format %{ "sqrtsd  $dst, $dst" %}
3319   ins_encode %{
3320     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
3321   %}
3322   ins_pipe(pipe_slow);
3323 %}
3324 
3325 // ---------------------------------------- VectorReinterpret ------------------------------------
3326 
3327 instruct reinterpret(vec dst) %{
3328   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
3329   match(Set dst (VectorReinterpret dst));
3330   ins_cost(125);
3331   format %{ "vector_reinterpret $dst\t!" %}
3332   ins_encode %{
3333     // empty
3334   %}
3335   ins_pipe( pipe_slow );
3336 %}
3337 
3338 instruct reinterpret_expand(vec dst, vec src, rRegP scratch) %{
3339   predicate(UseAVX == 0 &&
3340             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3341   match(Set dst (VectorReinterpret src));
3342   ins_cost(125);
3343   effect(TEMP dst, TEMP scratch);
3344   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3345   ins_encode %{
3346     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
3347     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
3348 
3349     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
3350     if (src_vlen_in_bytes == 4) {
3351       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3352     } else {
3353       assert(src_vlen_in_bytes == 8, "");
3354       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3355     }
3356     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3357   %}
3358   ins_pipe( pipe_slow );
3359 %}
3360 
3361 instruct vreinterpret_expand4(legVec dst, vec src, rRegP scratch) %{
3362   predicate(UseAVX > 0 &&
3363             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
3364             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3365   match(Set dst (VectorReinterpret src));
3366   ins_cost(125);
3367   effect(TEMP scratch);
3368   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3369   ins_encode %{
3370     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, $scratch$$Register);
3371   %}
3372   ins_pipe( pipe_slow );
3373 %}
3374 
3375 
3376 instruct vreinterpret_expand(legVec dst, vec src) %{
3377   predicate(UseAVX > 0 &&
3378             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
3379             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3380   match(Set dst (VectorReinterpret src));
3381   ins_cost(125);
3382   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
3383   ins_encode %{
3384     switch (Matcher::vector_length_in_bytes(this, $src)) {
3385       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3386       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3387       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3388       default: ShouldNotReachHere();
3389     }
3390   %}
3391   ins_pipe( pipe_slow );
3392 %}
3393 
3394 instruct reinterpret_shrink(vec dst, legVec src) %{
3395   predicate(Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
3396   match(Set dst (VectorReinterpret src));
3397   ins_cost(125);
3398   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
3399   ins_encode %{
3400     switch (Matcher::vector_length_in_bytes(this)) {
3401       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
3402       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3403       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3404       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3405       default: ShouldNotReachHere();
3406     }
3407   %}
3408   ins_pipe( pipe_slow );
3409 %}
3410 
3411 // ----------------------------------------------------------------------------------------------------
3412 
3413 #ifdef _LP64
3414 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
3415   match(Set dst (RoundDoubleMode src rmode));
3416   format %{ "roundsd $dst,$src" %}
3417   ins_cost(150);
3418   ins_encode %{
3419     assert(UseSSE >= 4, "required");
3420     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
3421   %}
3422   ins_pipe(pipe_slow);
3423 %}
3424 
3425 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
3426   match(Set dst (RoundDoubleMode (LoadD src) rmode));
3427   format %{ "roundsd $dst,$src" %}
3428   ins_cost(150);
3429   ins_encode %{
3430     assert(UseSSE >= 4, "required");
3431     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
3432   %}
3433   ins_pipe(pipe_slow);
3434 %}
3435 
3436 instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
3437   match(Set dst (RoundDoubleMode con rmode));
3438   effect(TEMP scratch_reg);
3439   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
3440   ins_cost(150);
3441   ins_encode %{
3442     assert(UseSSE >= 4, "required");
3443     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
3444   %}
3445   ins_pipe(pipe_slow);
3446 %}
3447 
3448 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
3449   predicate(Matcher::vector_length(n) < 8);
3450   match(Set dst (RoundDoubleModeV src rmode));
3451   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
3452   ins_encode %{
3453     assert(UseAVX > 0, "required");
3454     int vlen_enc = vector_length_encoding(this);
3455     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
3456   %}
3457   ins_pipe( pipe_slow );
3458 %}
3459 
3460 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
3461   predicate(Matcher::vector_length(n) == 8);
3462   match(Set dst (RoundDoubleModeV src rmode));
3463   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
3464   ins_encode %{
3465     assert(UseAVX > 2, "required");
3466     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
3467   %}
3468   ins_pipe( pipe_slow );
3469 %}
3470 
3471 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3472   predicate(Matcher::vector_length(n) < 8);
3473   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3474   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3475   ins_encode %{
3476     assert(UseAVX > 0, "required");
3477     int vlen_enc = vector_length_encoding(this);
3478     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
3479   %}
3480   ins_pipe( pipe_slow );
3481 %}
3482 
3483 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3484   predicate(Matcher::vector_length(n) == 8);
3485   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3486   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3487   ins_encode %{
3488     assert(UseAVX > 2, "required");
3489     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3490   %}
3491   ins_pipe( pipe_slow );
3492 %}
3493 #endif // _LP64
3494 
3495 instruct onspinwait() %{
3496   match(OnSpinWait);
3497   ins_cost(200);
3498 
3499   format %{
3500     $$template
3501     $$emit$$"pause\t! membar_onspinwait"
3502   %}
3503   ins_encode %{
3504     __ pause();
3505   %}
3506   ins_pipe(pipe_slow);
3507 %}
3508 
3509 // a * b + c
3510 instruct fmaD_reg(regD a, regD b, regD c) %{
3511   predicate(UseFMA);
3512   match(Set c (FmaD  c (Binary a b)));
3513   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3514   ins_cost(150);
3515   ins_encode %{
3516     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3517   %}
3518   ins_pipe( pipe_slow );
3519 %}
3520 
3521 // a * b + c
3522 instruct fmaF_reg(regF a, regF b, regF c) %{
3523   predicate(UseFMA);
3524   match(Set c (FmaF  c (Binary a b)));
3525   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3526   ins_cost(150);
3527   ins_encode %{
3528     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3529   %}
3530   ins_pipe( pipe_slow );
3531 %}
3532 
3533 // ====================VECTOR INSTRUCTIONS=====================================
3534 
3535 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
3536 instruct MoveVec2Leg(legVec dst, vec src) %{
3537   match(Set dst src);
3538   format %{ "" %}
3539   ins_encode %{
3540     ShouldNotReachHere();
3541   %}
3542   ins_pipe( fpu_reg_reg );
3543 %}
3544 
3545 instruct MoveLeg2Vec(vec dst, legVec src) %{
3546   match(Set dst src);
3547   format %{ "" %}
3548   ins_encode %{
3549     ShouldNotReachHere();
3550   %}
3551   ins_pipe( fpu_reg_reg );
3552 %}
3553 
3554 // ============================================================================
3555 
3556 // Load vectors generic operand pattern
3557 instruct loadV(vec dst, memory mem) %{
3558   match(Set dst (LoadVector mem));
3559   ins_cost(125);
3560   format %{ "load_vector $dst,$mem" %}
3561   ins_encode %{
3562     switch (Matcher::vector_length_in_bytes(this)) {
3563       case  4: __ movdl    ($dst$$XMMRegister, $mem$$Address); break;
3564       case  8: __ movq     ($dst$$XMMRegister, $mem$$Address); break;
3565       case 16: __ movdqu   ($dst$$XMMRegister, $mem$$Address); break;
3566       case 32: __ vmovdqu  ($dst$$XMMRegister, $mem$$Address); break;
3567       case 64: __ evmovdqul($dst$$XMMRegister, $mem$$Address, Assembler::AVX_512bit); break;
3568       default: ShouldNotReachHere();
3569     }
3570   %}
3571   ins_pipe( pipe_slow );
3572 %}
3573 
3574 // Store vectors generic operand pattern.
3575 instruct storeV(memory mem, vec src) %{
3576   match(Set mem (StoreVector mem src));
3577   ins_cost(145);
3578   format %{ "store_vector $mem,$src\n\t" %}
3579   ins_encode %{
3580     switch (Matcher::vector_length_in_bytes(this, $src)) {
3581       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
3582       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
3583       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
3584       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
3585       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
3586       default: ShouldNotReachHere();
3587     }
3588   %}
3589   ins_pipe( pipe_slow );
3590 %}
3591 
3592 // ---------------------------------------- Gather ------------------------------------
3593 
3594 // Gather INT, LONG, FLOAT, DOUBLE
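// Both gather flavors below gather unconditionally, so the mask handed to the
// hardware is simply all-ones. The gather instructions consume (clear) that
// mask as lanes complete, which is why $mask and $ktmp are declared TEMP
// rather than being plain inputs.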
3595 
3596 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
3597   predicate(Matcher::vector_length_in_bytes(n) <= 32);
3598   match(Set dst (LoadVectorGather mem idx));
3599   effect(TEMP dst, TEMP tmp, TEMP mask);
3600   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
3601   ins_encode %{
3602     assert(UseAVX >= 2, "sanity");
3603 
3604     int vlen_enc = vector_length_encoding(this);
3605     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3606 
3607     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
3608     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3609 
3610     if (vlen_enc == Assembler::AVX_128bit) {
3611       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3612     } else {
3613       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3614     }
3615     __ lea($tmp$$Register, $mem$$Address);
3616     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
3617   %}
3618   ins_pipe( pipe_slow );
3619 %}
3620 
3621 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
3622   predicate(Matcher::vector_length_in_bytes(n) == 64);
3623   match(Set dst (LoadVectorGather mem idx));
3624   effect(TEMP dst, TEMP tmp, TEMP ktmp);
3625   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
3626   ins_encode %{
3627     assert(UseAVX > 2, "sanity");
3628 
3629     int vlen_enc = vector_length_encoding(this);
3630     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3631 
3632     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3633 
3634     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3635     __ lea($tmp$$Register, $mem$$Address);
3636     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3637   %}
3638   ins_pipe( pipe_slow );
3639 %}
3640 
3641 // ====================Scatter=======================================
3642 
3643 // Scatter INT, LONG, FLOAT, DOUBLE
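// As with gather above, the all-ones opmask in $ktmp is consumed by the
// scatter instruction, hence it is declared TEMP.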
3644 
3645 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
3646   predicate(UseAVX > 2);
3647   match(Set mem (StoreVectorScatter mem (Binary src idx)));
3648   effect(TEMP tmp, TEMP ktmp);
3649   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
3650   ins_encode %{
3651     int vlen_enc = vector_length_encoding(this, $src);
3652     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
3653 
3654     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
3655     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3656 
3657     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3658     __ lea($tmp$$Register, $mem$$Address);
3659     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
3660   %}
3661   ins_pipe( pipe_slow );
3662 %}
3663 
3664 // ====================REPLICATE=======================================
3665 
3666 // Replicate byte scalar to be vector
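// Three code shapes are generated below, roughly: a direct GPR broadcast
// (evpbroadcastb) when AVX512BW (plus VL for <512-bit operands) is available,
// movdl plus vpbroadcastb on AVX2, and a movdl/punpcklbw/pshuflw (+ lane insert)
// sequence as the SSE fallback.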
3667 instruct ReplB_reg(vec dst, rRegI src) %{
3668   match(Set dst (ReplicateB src));
3669   format %{ "replicateB $dst,$src" %}
3670   ins_encode %{
3671     uint vlen = Matcher::vector_length(this);
3672     if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for <512bit operands
3673       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
3674       int vlen_enc = vector_length_encoding(this);
3675       __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
3676     } else if (VM_Version::supports_avx2()) {
3677       int vlen_enc = vector_length_encoding(this);
3678       __ movdl($dst$$XMMRegister, $src$$Register);
3679       __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3680     } else {
3681       __ movdl($dst$$XMMRegister, $src$$Register);
3682       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3683       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3684       if (vlen >= 16) {
3685         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3686         if (vlen >= 32) {
3687           assert(vlen == 32, "sanity");
3688           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3689         }
3690       }
3691     }
3692   %}
3693   ins_pipe( pipe_slow );
3694 %}
3695 
3696 instruct ReplB_mem(vec dst, memory mem) %{
3697   predicate(VM_Version::supports_avx2());
3698   match(Set dst (ReplicateB (LoadB mem)));
3699   format %{ "replicateB $dst,$mem" %}
3700   ins_encode %{
3701     int vlen_enc = vector_length_encoding(this);
3702     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
3703   %}
3704   ins_pipe( pipe_slow );
3705 %}
3706 
3707 instruct ReplB_imm(vec dst, immI con) %{
3708   match(Set dst (ReplicateB con));
3709   format %{ "replicateB $dst,$con" %}
3710   ins_encode %{
3711     uint vlen = Matcher::vector_length(this);
3712     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 1));
3713     if (vlen == 4) {
3714       __ movdl($dst$$XMMRegister, const_addr);
3715     } else {
3716       __ movq($dst$$XMMRegister, const_addr);
3717       if (vlen >= 16) {
3718         if (VM_Version::supports_avx2()) {
3719           int vlen_enc = vector_length_encoding(this);
3720           __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3721         } else {
3722           assert(vlen == 16, "sanity");
3723           __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3724         }
3725       }
3726     }
3727   %}
3728   ins_pipe( pipe_slow );
3729 %}
3730 
3731 // Replicate byte scalar zero to be vector
3732 instruct ReplB_zero(vec dst, immI_0 zero) %{
3733   match(Set dst (ReplicateB zero));
3734   format %{ "replicateB $dst,$zero" %}
3735   ins_encode %{
3736     uint vlen = Matcher::vector_length(this);
3737     if (vlen <= 16) {
3738       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3739     } else {
3740       // Use vpxor since AVX512F does not have 512bit vxorpd (requires AVX512DQ).
3741       int vlen_enc = vector_length_encoding(this);
3742       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3743     }
3744   %}
3745   ins_pipe( fpu_reg_reg );
3746 %}
3747 
3748 // ====================ReplicateS=======================================
3749 
3750 instruct ReplS_reg(vec dst, rRegI src) %{
3751   match(Set dst (ReplicateS src));
3752   format %{ "replicateS $dst,$src" %}
3753   ins_encode %{
3754     uint vlen = Matcher::vector_length(this);
3755     if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL+BW for <512bit operands
3756       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
3757       int vlen_enc = vector_length_encoding(this);
3758       __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
3759     } else if (VM_Version::supports_avx2()) {
3760       int vlen_enc = vector_length_encoding(this);
3761       __ movdl($dst$$XMMRegister, $src$$Register);
3762       __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3763     } else {
3764       __ movdl($dst$$XMMRegister, $src$$Register);
3765       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3766       if (vlen >= 8) {
3767         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3768         if (vlen >= 16) {
3769           assert(vlen == 16, "sanity");
3770           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3771         }
3772       }
3773     }
3774   %}
3775   ins_pipe( pipe_slow );
3776 %}
3777 
3778 instruct ReplS_mem(vec dst, memory mem) %{
3779   predicate(VM_Version::supports_avx2());
3780   match(Set dst (ReplicateS (LoadS mem)));
3781   format %{ "replicateS $dst,$mem" %}
3782   ins_encode %{
3783     int vlen_enc = vector_length_encoding(this);
3784     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
3785   %}
3786   ins_pipe( pipe_slow );
3787 %}
3788 
3789 instruct ReplS_imm(vec dst, immI con) %{
3790   match(Set dst (ReplicateS con));
3791   format %{ "replicateS $dst,$con" %}
3792   ins_encode %{
3793     uint vlen = Matcher::vector_length(this);
3794     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 2));
3795     if (vlen == 2) {
3796       __ movdl($dst$$XMMRegister, const_addr);
3797     } else {
3798       __ movq($dst$$XMMRegister, const_addr);
3799       if (vlen >= 8) {
3800         if (VM_Version::supports_avx2()) {
3801           int vlen_enc = vector_length_encoding(this);
3802           __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3803         } else {
3804           assert(vlen == 8, "sanity");
3805           __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3806         }
3807       }
3808     }
3809   %}
3810   ins_pipe( fpu_reg_reg );
3811 %}
3812 
3813 instruct ReplS_zero(vec dst, immI_0 zero) %{
3814   match(Set dst (ReplicateS zero));
3815   format %{ "replicateS $dst,$zero" %}
3816   ins_encode %{
3817     uint vlen = Matcher::vector_length(this);
3818     if (vlen <= 8) {
3819       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3820     } else {
3821       int vlen_enc = vector_length_encoding(this);
3822       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3823     }
3824   %}
3825   ins_pipe( fpu_reg_reg );
3826 %}
3827 
3828 // ====================ReplicateI=======================================
3829 
3830 instruct ReplI_reg(vec dst, rRegI src) %{
3831   match(Set dst (ReplicateI src));
3832   format %{ "replicateI $dst,$src" %}
3833   ins_encode %{
3834     uint vlen = Matcher::vector_length(this);
3835     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3836       int vlen_enc = vector_length_encoding(this);
3837       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
3838     } else if (VM_Version::supports_avx2()) {
3839       int vlen_enc = vector_length_encoding(this);
3840       __ movdl($dst$$XMMRegister, $src$$Register);
3841       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3842     } else {
3843       __ movdl($dst$$XMMRegister, $src$$Register);
3844       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3845       if (vlen >= 8) {
3846         assert(vlen == 8, "sanity");
3847         __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3848       }
3849     }
3850   %}
3851   ins_pipe( pipe_slow );
3852 %}
3853 
3854 instruct ReplI_mem(vec dst, memory mem) %{
3855   match(Set dst (ReplicateI (LoadI mem)));
3856   format %{ "replicateI $dst,$mem" %}
3857   ins_encode %{
3858     uint vlen = Matcher::vector_length(this);
3859     if (vlen <= 4) {
3860       __ movdl($dst$$XMMRegister, $mem$$Address);
3861       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3862     } else {
3863       assert(VM_Version::supports_avx2(), "sanity");
3864       int vlen_enc = vector_length_encoding(this);
3865       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
3866     }
3867   %}
3868   ins_pipe( pipe_slow );
3869 %}
3870 
3871 instruct ReplI_imm(vec dst, immI con) %{
3872   match(Set dst (ReplicateI con));
3873   format %{ "replicateI $dst,$con" %}
3874   ins_encode %{
3875     uint vlen = Matcher::vector_length(this);
3876     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 4));
3877     if (vlen <= 4) {
3878       __ movq($dst$$XMMRegister, const_addr);
3879       if (vlen == 4) {
3880         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3881       }
3882     } else {
3883       assert(VM_Version::supports_avx2(), "sanity");
3884       int vlen_enc = vector_length_encoding(this);
3885       __ movq($dst$$XMMRegister, const_addr);
3886       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3887     }
3888   %}
3889   ins_pipe( pipe_slow );
3890 %}
3891 
3892 // Replicate integer (4 byte) scalar zero to be vector
3893 instruct ReplI_zero(vec dst, immI_0 zero) %{
3894   match(Set dst (ReplicateI zero));
3895   format %{ "replicateI $dst,$zero" %}
3896   ins_encode %{
3897     uint vlen = Matcher::vector_length(this);
3898     if (vlen <= 4) {
3899       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3900     } else {
3901       int vlen_enc = vector_length_encoding(this);
3902       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3903     }
3904   %}
3905   ins_pipe( fpu_reg_reg );
3906 %}
3907 
3908 instruct ReplI_M1(vec dst, immI_M1 con) %{
3909   predicate(UseAVX > 0);
3910   match(Set dst (ReplicateB con));
3911   match(Set dst (ReplicateS con));
3912   match(Set dst (ReplicateI con));
3913   effect(TEMP dst);
3914   format %{ "vallones $dst" %}
3915   ins_encode %{
3916     int vector_len = vector_length_encoding(this);
3917     __ vallones($dst$$XMMRegister, vector_len);
3918   %}
3919   ins_pipe( pipe_slow );
3920 %}
3921 
3922 // ====================ReplicateL=======================================
3923 
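     // On 64-bit VMs the long lives in a single GPR; on 32-bit VMs it is an eRegL
     // pair, so the low and high halves are moved in separately (HIGH_FROM_LOW) and
     // combined with punpckldq before being broadcast.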
3924 #ifdef _LP64
3925 // Replicate long (8 byte) scalar to be vector
3926 instruct ReplL_reg(vec dst, rRegL src) %{
3927   match(Set dst (ReplicateL src));
3928   format %{ "replicateL $dst,$src" %}
3929   ins_encode %{
3930     uint vlen = Matcher::vector_length(this);
3931     if (vlen == 2) {
3932       __ movdq($dst$$XMMRegister, $src$$Register);
3933       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3934     } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3935       int vlen_enc = vector_length_encoding(this);
3936       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
3937     } else if (VM_Version::supports_avx2()) {
3938       assert(vlen == 4, "sanity");
3939       int vlen_enc = vector_length_encoding(this);
3940       __ movdq($dst$$XMMRegister, $src$$Register);
3941       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3942     } else {
3943       assert(vlen == 4, "sanity");
3944       __ movdq($dst$$XMMRegister, $src$$Register);
3945       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3946       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3947     }
3948   %}
3949   ins_pipe( pipe_slow );
3950 %}
3951 #else // _LP64
3952 // Replicate long (8 byte) scalar to be vector
3953 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
3954   predicate(Matcher::vector_length(n) <= 4);
3955   match(Set dst (ReplicateL src));
3956   effect(TEMP dst, USE src, TEMP tmp);
3957   format %{ "replicateL $dst,$src" %}
3958   ins_encode %{
3959     uint vlen = Matcher::vector_length(this);
3960     if (vlen == 2) {
3961       __ movdl($dst$$XMMRegister, $src$$Register);
3962       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3963       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3964       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3965     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3966       int vlen_enc = Assembler::AVX_256bit;
3967       __ movdl($dst$$XMMRegister, $src$$Register);
3968       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3969       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3970       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3971     } else {
3972       __ movdl($dst$$XMMRegister, $src$$Register);
3973       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3974       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3975       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3976       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3977     }
3978   %}
3979   ins_pipe( pipe_slow );
3980 %}
3981 
3982 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
3983   predicate(Matcher::vector_length(n) == 8);
3984   match(Set dst (ReplicateL src));
3985   effect(TEMP dst, USE src, TEMP tmp);
3986   format %{ "replicateL $dst,$src" %}
3987   ins_encode %{
3988     if (VM_Version::supports_avx512vl()) {
3989       __ movdl($dst$$XMMRegister, $src$$Register);
3990       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3991       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3992       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3993       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3994       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3995     } else {
3996       int vlen_enc = Assembler::AVX_512bit;
3997       __ movdl($dst$$XMMRegister, $src$$Register);
3998       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3999       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4000       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4001     }
4002   %}
4003   ins_pipe( pipe_slow );
4004 %}
4005 #endif // _LP64
4006 
4007 instruct ReplL_mem(vec dst, memory mem) %{
4008   match(Set dst (ReplicateL (LoadL mem)));
4009   format %{ "replicateL $dst,$mem" %}
4010   ins_encode %{
4011     uint vlen = Matcher::vector_length(this);
4012     if (vlen == 2) {
4013       __ movq($dst$$XMMRegister, $mem$$Address);
4014       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4015     } else {
4016       assert(VM_Version::supports_avx2(), "sanity");
4017       int vlen_enc = vector_length_encoding(this);
4018       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
4019     }
4020   %}
4021   ins_pipe( pipe_slow );
4022 %}
4023 
4024 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4025 instruct ReplL_imm(vec dst, immL con) %{
4026   match(Set dst (ReplicateL con));
4027   format %{ "replicateL $dst,$con" %}
4028   ins_encode %{
4029     uint vlen = Matcher::vector_length(this);
4030     InternalAddress const_addr = $constantaddress($con);
4031     if (vlen == 2) {
4032       __ movq($dst$$XMMRegister, const_addr);
4033       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4034     } else {
4035       assert(VM_Version::supports_avx2(), "sanity");
4036       int vlen_enc = vector_length_encoding(this);
4037       __ movq($dst$$XMMRegister, const_addr);
4038       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4039     }
4040   %}
4041   ins_pipe( pipe_slow );
4042 %}
4043 
4044 instruct ReplL_zero(vec dst, immL0 zero) %{
4045   match(Set dst (ReplicateL zero));
4046   format %{ "replicateL $dst,$zero" %}
4047   ins_encode %{
4048     int vlen = Matcher::vector_length(this);
4049     if (vlen == 2) {
4050       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4051     } else {
4052       int vlen_enc = vector_length_encoding(this);
4053       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4054     }
4055   %}
4056   ins_pipe( fpu_reg_reg );
4057 %}
4058 
4059 instruct ReplL_M1(vec dst, immL_M1 con) %{
4060   predicate(UseAVX > 0);
4061   match(Set dst (ReplicateL con));
4062   effect(TEMP dst);
4063   format %{ "vallones $dst" %}
4064   ins_encode %{
4065     int vector_len = vector_length_encoding(this);
4066     __ vallones($dst$$XMMRegister, vector_len);
4067   %}
4068   ins_pipe( pipe_slow );
4069 %}
4070 
4071 // ====================ReplicateF=======================================
4072 
4073 instruct ReplF_reg(vec dst, vlRegF src) %{
4074   match(Set dst (ReplicateF src));
4075   format %{ "replicateF $dst,$src" %}
4076   ins_encode %{
4077     uint vlen = Matcher::vector_length(this);
4078     if (vlen <= 4) {
4079       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4080     } else if (VM_Version::supports_avx2()) {
4081       int vlen_enc = vector_length_encoding(this);
4082       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4083     } else {
4084       assert(vlen == 8, "sanity");
4085       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4086       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4087     }
4088   %}
4089   ins_pipe( pipe_slow );
4090 %}
4091 
4092 instruct ReplF_mem(vec dst, memory mem) %{
4093   match(Set dst (ReplicateF (LoadF mem)));
4094   format %{ "replicateF $dst,$mem" %}
4095   ins_encode %{
4096     uint vlen = Matcher::vector_length(this);
4097     if (vlen <= 4) {
4098       __ movdl($dst$$XMMRegister, $mem$$Address);
4099       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4100     } else {
4101       assert(VM_Version::supports_avx(), "sanity");
4102       int vlen_enc = vector_length_encoding(this);
4103       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
4104     }
4105   %}
4106   ins_pipe( pipe_slow );
4107 %}
4108 
4109 instruct ReplF_zero(vec dst, immF0 zero) %{
4110   match(Set dst (ReplicateF zero));
4111   format %{ "replicateF $dst,$zero" %}
4112   ins_encode %{
4113     uint vlen = Matcher::vector_length(this);
4114     if (vlen <= 4) {
4115       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4116     } else {
4117       int vlen_enc = vector_length_encoding(this);
4118       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4119     }
4120   %}
4121   ins_pipe( fpu_reg_reg );
4122 %}
4123 
4124 // ====================ReplicateD=======================================
4125 
4126 // Replicate double (8 bytes) scalar to be vector
4127 instruct ReplD_reg(vec dst, vlRegD src) %{
4128   match(Set dst (ReplicateD src));
4129   format %{ "replicateD $dst,$src" %}
4130   ins_encode %{
4131     uint vlen = Matcher::vector_length(this);
4132     if (vlen == 2) {
4133       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4134     } else if (VM_Version::supports_avx2()) {
4135       int vlen_enc = vector_length_encoding(this);
4136       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4137     } else {
4138       assert(vlen == 4, "sanity");
4139       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4140       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4141     }
4142   %}
4143   ins_pipe( pipe_slow );
4144 %}
4145 
4146 instruct ReplD_mem(vec dst, memory mem) %{
4147   match(Set dst (ReplicateD (LoadD mem)));
4148   format %{ "replicateD $dst,$mem" %}
4149   ins_encode %{
4150     uint vlen = Matcher::vector_length(this);
4151     if (vlen == 2) {
4152       __ movq($dst$$XMMRegister, $mem$$Address);
4153       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
4154     } else {
4155       assert(VM_Version::supports_avx(), "sanity");
4156       int vlen_enc = vector_length_encoding(this);
4157       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4158     }
4159   %}
4160   ins_pipe( pipe_slow );
4161 %}
4162 
4163 instruct ReplD_zero(vec dst, immD0 zero) %{
4164   match(Set dst (ReplicateD zero));
4165   format %{ "replicateD $dst,$zero" %}
4166   ins_encode %{
4167     uint vlen = Matcher::vector_length(this);
4168     if (vlen == 2) {
4169       __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4170     } else {
4171       int vlen_enc = vector_length_encoding(this);
4172       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4173     }
4174   %}
4175   ins_pipe( fpu_reg_reg );
4176 %}
4177 
4178 // ====================VECTOR INSERT=======================================
4179 
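     // Element insertion into vectors wider than 128 bits works lane-wise: the
     // 128-bit lane holding the element is extracted, the scalar is inserted into
     // it, and the lane is written back. x_idx is the position inside the lane and
     // y_idx selects the lane, e.g. for T_INT in a 256-bit vector idx == 6 gives
     // elem_per_lane == 4, x_idx == 2, y_idx == 1.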
4180 instruct insert(vec dst, rRegI val, immU8 idx) %{
4181   predicate(Matcher::vector_length_in_bytes(n) < 32);
4182   match(Set dst (VectorInsert (Binary dst val) idx));
4183   format %{ "vector_insert $dst,$val,$idx" %}
4184   ins_encode %{
4185     assert(UseSSE >= 4, "required");
4186     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
4187 
4188     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4189 
4190     assert(is_integral_type(elem_bt), "");
4191     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4192 
4193     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4194   %}
4195   ins_pipe( pipe_slow );
4196 %}
4197 
4198 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4199   predicate(Matcher::vector_length_in_bytes(n) == 32);
4200   match(Set dst (VectorInsert (Binary src val) idx));
4201   effect(TEMP vtmp);
4202   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4203   ins_encode %{
4204     int vlen_enc = Assembler::AVX_256bit;
4205     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4206     int elem_per_lane = 16/type2aelembytes(elem_bt);
4207     int log2epr = log2(elem_per_lane);
4208 
4209     assert(is_integral_type(elem_bt), "sanity");
4210     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4211 
4212     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4213     uint y_idx = ($idx$$constant >> log2epr) & 1;
4214     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4215     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4216     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4217   %}
4218   ins_pipe( pipe_slow );
4219 %}
4220 
4221 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4222   predicate(Matcher::vector_length_in_bytes(n) == 64);
4223   match(Set dst (VectorInsert (Binary src val) idx));
4224   effect(TEMP vtmp);
4225   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4226   ins_encode %{
4227     assert(UseAVX > 2, "sanity");
4228 
4229     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4230     int elem_per_lane = 16/type2aelembytes(elem_bt);
4231     int log2epr = log2(elem_per_lane);
4232 
4233     assert(is_integral_type(elem_bt), "");
4234     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4235 
4236     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4237     uint y_idx = ($idx$$constant >> log2epr) & 3;
4238     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4239     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4240     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4241   %}
4242   ins_pipe( pipe_slow );
4243 %}
4244 
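     // Long (and, further below, double) element insertion relies on pinsrq/vpinsrq,
     // which take a 64-bit GPR operand, so these rules are only available on LP64.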
4245 #ifdef _LP64
4246 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4247   predicate(Matcher::vector_length(n) == 2);
4248   match(Set dst (VectorInsert (Binary dst val) idx));
4249   format %{ "vector_insert $dst,$val,$idx" %}
4250   ins_encode %{
4251     assert(UseSSE >= 4, "required");
4252     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4253     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4254 
4255     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4256   %}
4257   ins_pipe( pipe_slow );
4258 %}
4259 
4260 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4261   predicate(Matcher::vector_length(n) == 4);
4262   match(Set dst (VectorInsert (Binary src val) idx));
4263   effect(TEMP vtmp);
4264   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4265   ins_encode %{
4266     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4267     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4268 
4269     uint x_idx = $idx$$constant & right_n_bits(1);
4270     uint y_idx = ($idx$$constant >> 1) & 1;
4271     int vlen_enc = Assembler::AVX_256bit;
4272     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4273     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4274     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4275   %}
4276   ins_pipe( pipe_slow );
4277 %}
4278 
4279 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4280   predicate(Matcher::vector_length(n) == 8);
4281   match(Set dst (VectorInsert (Binary src val) idx));
4282   effect(TEMP vtmp);
4283   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4284   ins_encode %{
4285     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
4286     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4287 
4288     uint x_idx = $idx$$constant & right_n_bits(1);
4289     uint y_idx = ($idx$$constant >> 1) & 3;
4290     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4291     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4292     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4293   %}
4294   ins_pipe( pipe_slow );
4295 %}
4296 #endif
4297 
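     // Float elements are inserted with insertps/vinsertps; for 256/512-bit vectors
     // the containing 128-bit lane is extracted and re-inserted as above.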
4298 instruct insertF(vec dst, regF val, immU8 idx) %{
4299   predicate(Matcher::vector_length(n) < 8);
4300   match(Set dst (VectorInsert (Binary dst val) idx));
4301   format %{ "vector_insert $dst,$val,$idx" %}
4302   ins_encode %{
4303     assert(UseSSE >= 4, "sanity");
4304 
4305     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4306     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4307 
4308     __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
4309   %}
4310   ins_pipe( pipe_slow );
4311 %}
4312 
4313 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4314   predicate(Matcher::vector_length(n) >= 8);
4315   match(Set dst (VectorInsert (Binary src val) idx));
4316   effect(TEMP vtmp);
4317   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4318   ins_encode %{
4319     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4320     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4321 
4322     int vlen = Matcher::vector_length(this);
4323     uint x_idx = $idx$$constant & right_n_bits(2);
4324     if (vlen == 8) {
4325       uint y_idx = ($idx$$constant >> 2) & 1;
4326       int vlen_enc = Assembler::AVX_256bit;
4327       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4328       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4329       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4330     } else {
4331       assert(vlen == 16, "sanity");
4332       uint y_idx = ($idx$$constant >> 2) & 3;
4333       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4334       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4335       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4336     }
4337   %}
4338   ins_pipe( pipe_slow );
4339 %}
4340 
4341 #ifdef _LP64
4342 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4343   predicate(Matcher::vector_length(n) == 2);
4344   match(Set dst (VectorInsert (Binary dst val) idx));
4345   effect(TEMP tmp);
4346   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4347   ins_encode %{
4348     assert(UseSSE >= 4, "sanity");
4349     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4350     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4351 
4352     __ movq($tmp$$Register, $val$$XMMRegister);
4353     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
4354   %}
4355   ins_pipe( pipe_slow );
4356 %}
4357 
4358 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
4359   predicate(Matcher::vector_length(n) == 4);
4360   match(Set dst (VectorInsert (Binary src val) idx));
4361   effect(TEMP vtmp, TEMP tmp);
4362   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4363   ins_encode %{
4364     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4365     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4366 
4367     uint x_idx = $idx$$constant & right_n_bits(1);
4368     uint y_idx = ($idx$$constant >> 1) & 1;
4369     int vlen_enc = Assembler::AVX_256bit;
4370     __ movq($tmp$$Register, $val$$XMMRegister);
4371     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4372     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4373     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4374   %}
4375   ins_pipe( pipe_slow );
4376 %}
4377 
4378 instruct insert8D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, legVec vtmp) %{
4379   predicate(Matcher::vector_length(n) == 8);
4380   match(Set dst (VectorInsert (Binary src val) idx));
4381   effect(TEMP tmp, TEMP vtmp);
4382   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4383   ins_encode %{
4384     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4385     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4386 
4387     uint x_idx = $idx$$constant & right_n_bits(1);
4388     uint y_idx = ($idx$$constant >> 1) & 3;
4389     __ movq($tmp$$Register, $val$$XMMRegister);
4390     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4391     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4392     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4393   %}
4394   ins_pipe( pipe_slow );
4395 %}
4396 #endif
4397 
4398 // ====================REDUCTION ARITHMETIC=======================================
4399 
4400 // =======================Int Reduction==========================================
4401 
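     // Reduction nodes take the scalar carry-in in src1 and the vector in src2; the
     // macro-assembler helper (reduceI) folds the vector lanes into a scalar using
     // the two XMM temporaries and combines the result with src1.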
4402 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4403   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
4404   match(Set dst (AddReductionVI src1 src2));
4405   match(Set dst (MulReductionVI src1 src2));
4406   match(Set dst (AndReductionV  src1 src2));
4407   match(Set dst ( OrReductionV  src1 src2));
4408   match(Set dst (XorReductionV  src1 src2));
4409   match(Set dst (MinReductionV  src1 src2));
4410   match(Set dst (MaxReductionV  src1 src2));
4411   effect(TEMP vtmp1, TEMP vtmp2);
4412   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4413   ins_encode %{
4414     int opcode = this->ideal_Opcode();
4415     int vlen = Matcher::vector_length(this, $src2);
4416     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4417   %}
4418   ins_pipe( pipe_slow );
4419 %}
4420 
4421 // =======================Long Reduction==========================================
4422 
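     // Two flavors: with AVX512DQ the rule may use the full EVEX register file
     // (vec operands), otherwise it is restricted to legacy-encodable registers
     // (legVec operands).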
4423 #ifdef _LP64
4424 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4425   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
4426   match(Set dst (AddReductionVL src1 src2));
4427   match(Set dst (MulReductionVL src1 src2));
4428   match(Set dst (AndReductionV  src1 src2));
4429   match(Set dst ( OrReductionV  src1 src2));
4430   match(Set dst (XorReductionV  src1 src2));
4431   match(Set dst (MinReductionV  src1 src2));
4432   match(Set dst (MaxReductionV  src1 src2));
4433   effect(TEMP vtmp1, TEMP vtmp2);
4434   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4435   ins_encode %{
4436     int opcode = this->ideal_Opcode();
4437     int vlen = Matcher::vector_length(this, $src2);
4438     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4439   %}
4440   ins_pipe( pipe_slow );
4441 %}
4442 
4443 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
4444   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
4445   match(Set dst (AddReductionVL src1 src2));
4446   match(Set dst (MulReductionVL src1 src2));
4447   match(Set dst (AndReductionV  src1 src2));
4448   match(Set dst ( OrReductionV  src1 src2));
4449   match(Set dst (XorReductionV  src1 src2));
4450   match(Set dst (MinReductionV  src1 src2));
4451   match(Set dst (MaxReductionV  src1 src2));
4452   effect(TEMP vtmp1, TEMP vtmp2);
4453   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4454   ins_encode %{
4455     int opcode = this->ideal_Opcode();
4456     int vlen = Matcher::vector_length(this, $src2);
4457     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4458   %}
4459   ins_pipe( pipe_slow );
4460 %}
4461 #endif // _LP64
4462 
4463 // =======================Float Reduction==========================================
4464 
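     // Float and double add/mul reductions accumulate into $dst, which carries the
     // scalar input and receives the result; the rules are split by vector length
     // because wider vectors need a second temporary.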
4465 instruct reductionF128(regF dst, vec src, vec vtmp) %{
4466   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
4467   match(Set dst (AddReductionVF dst src));
4468   match(Set dst (MulReductionVF dst src));
4469   effect(TEMP dst, TEMP vtmp);
4470   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
4471   ins_encode %{
4472     int opcode = this->ideal_Opcode();
4473     int vlen = Matcher::vector_length(this, $src);
4474     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4475   %}
4476   ins_pipe( pipe_slow );
4477 %}
4478 
4479 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
4480   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4481   match(Set dst (AddReductionVF dst src));
4482   match(Set dst (MulReductionVF dst src));
4483   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4484   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4485   ins_encode %{
4486     int opcode = this->ideal_Opcode();
4487     int vlen = Matcher::vector_length(this, $src);
4488     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4489   %}
4490   ins_pipe( pipe_slow );
4491 %}
4492 
4493 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4494   predicate(Matcher::vector_length(n->in(2)) == 16); // src
4495   match(Set dst (AddReductionVF dst src));
4496   match(Set dst (MulReductionVF dst src));
4497   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4498   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4499   ins_encode %{
4500     int opcode = this->ideal_Opcode();
4501     int vlen = Matcher::vector_length(this, $src);
4502     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4503   %}
4504   ins_pipe( pipe_slow );
4505 %}
4506 
4507 // =======================Double Reduction==========================================
4508 
4509 instruct reduction2D(regD dst, vec src, vec vtmp) %{
4510   predicate(Matcher::vector_length(n->in(2)) == 2); // src
4511   match(Set dst (AddReductionVD dst src));
4512   match(Set dst (MulReductionVD dst src));
4513   effect(TEMP dst, TEMP vtmp);
4514   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
4515   ins_encode %{
4516     int opcode = this->ideal_Opcode();
4517     int vlen = Matcher::vector_length(this, $src);
4518     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4519   %}
4520   ins_pipe( pipe_slow );
4521 %}
4522 
4523 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
4524   predicate(Matcher::vector_length(n->in(2)) == 4); // src
4525   match(Set dst (AddReductionVD dst src));
4526   match(Set dst (MulReductionVD dst src));
4527   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4528   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4529   ins_encode %{
4530     int opcode = this->ideal_Opcode();
4531     int vlen = Matcher::vector_length(this, $src);
4532     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4533   %}
4534   ins_pipe( pipe_slow );
4535 %}
4536 
4537 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4538   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4539   match(Set dst (AddReductionVD dst src));
4540   match(Set dst (MulReductionVD dst src));
4541   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4542   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4543   ins_encode %{
4544     int opcode = this->ideal_Opcode();
4545     int vlen = Matcher::vector_length(this, $src);
4546     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4547   %}
4548   ins_pipe( pipe_slow );
4549 %}
4550 
4551 // =======================Byte Reduction==========================================
4552 
4553 #ifdef _LP64
4554 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4555   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
4556   match(Set dst (AddReductionVI src1 src2));
4557   match(Set dst (AndReductionV  src1 src2));
4558   match(Set dst ( OrReductionV  src1 src2));
4559   match(Set dst (XorReductionV  src1 src2));
4560   match(Set dst (MinReductionV  src1 src2));
4561   match(Set dst (MaxReductionV  src1 src2));
4562   effect(TEMP vtmp1, TEMP vtmp2);
4563   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4564   ins_encode %{
4565     int opcode = this->ideal_Opcode();
4566     int vlen = Matcher::vector_length(this, $src2);
4567     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4568   %}
4569   ins_pipe( pipe_slow );
4570 %}
4571 
4572 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4573   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
4574   match(Set dst (AddReductionVI src1 src2));
4575   match(Set dst (AndReductionV  src1 src2));
4576   match(Set dst ( OrReductionV  src1 src2));
4577   match(Set dst (XorReductionV  src1 src2));
4578   match(Set dst (MinReductionV  src1 src2));
4579   match(Set dst (MaxReductionV  src1 src2));
4580   effect(TEMP vtmp1, TEMP vtmp2);
4581   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4582   ins_encode %{
4583     int opcode = this->ideal_Opcode();
4584     int vlen = Matcher::vector_length(this, $src2);
4585     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4586   %}
4587   ins_pipe( pipe_slow );
4588 %}
4589 #endif
4590 
4591 // =======================Short Reduction==========================================
4592 
4593 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4594   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
4595   match(Set dst (AddReductionVI src1 src2));
4596   match(Set dst (MulReductionVI src1 src2));
4597   match(Set dst (AndReductionV  src1 src2));
4598   match(Set dst ( OrReductionV  src1 src2));
4599   match(Set dst (XorReductionV  src1 src2));
4600   match(Set dst (MinReductionV  src1 src2));
4601   match(Set dst (MaxReductionV  src1 src2));
4602   effect(TEMP vtmp1, TEMP vtmp2);
4603   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4604   ins_encode %{
4605     int opcode = this->ideal_Opcode();
4606     int vlen = Matcher::vector_length(this, $src2);
4607     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4608   %}
4609   ins_pipe( pipe_slow );
4610 %}
4611 
4612 // =======================Mul Reduction==========================================
4613 
4614 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4615   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4616             Matcher::vector_length(n->in(2)) <= 32); // src2
4617   match(Set dst (MulReductionVI src1 src2));
4618   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4619   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4620   ins_encode %{
4621     int opcode = this->ideal_Opcode();
4622     int vlen = Matcher::vector_length(this, $src2);
4623     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4624   %}
4625   ins_pipe( pipe_slow );
4626 %}
4627 
4628 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4629   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4630             Matcher::vector_length(n->in(2)) == 64); // src2
4631   match(Set dst (MulReductionVI src1 src2));
4632   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4633   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4634   ins_encode %{
4635     int opcode = this->ideal_Opcode();
4636     int vlen = Matcher::vector_length(this, $src2);
4637     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4638   %}
4639   ins_pipe( pipe_slow );
4640 %}
4641 
4642 //--------------------Min/Max Float Reduction --------------------
4643 // Float Min/Max Reduction
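     // Two sets of rules: the immF/immD src1 forms require the scalar input to be
     // the identity value (+Inf for min, -Inf for max, enforced by the predicate),
     // while the *_av forms accumulate into $dst.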
4644 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
4645                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4646   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4647             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4648              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4649             Matcher::vector_length(n->in(2)) == 2);
4650   match(Set dst (MinReductionV src1 src2));
4651   match(Set dst (MaxReductionV src1 src2));
4652   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4653   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4654   ins_encode %{
4655     assert(UseAVX > 0, "sanity");
4656 
4657     int opcode = this->ideal_Opcode();
4658     int vlen = Matcher::vector_length(this, $src2);
4659     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4660                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4661   %}
4662   ins_pipe( pipe_slow );
4663 %}
4664 
4665 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
4666                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4667   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4668             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4669              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4670             Matcher::vector_length(n->in(2)) >= 4);
4671   match(Set dst (MinReductionV src1 src2));
4672   match(Set dst (MaxReductionV src1 src2));
4673   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4674   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4675   ins_encode %{
4676     assert(UseAVX > 0, "sanity");
4677 
4678     int opcode = this->ideal_Opcode();
4679     int vlen = Matcher::vector_length(this, $src2);
4680     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4681                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4682   %}
4683   ins_pipe( pipe_slow );
4684 %}
4685 
4686 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
4687                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4688   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4689             Matcher::vector_length(n->in(2)) == 2);
4690   match(Set dst (MinReductionV dst src));
4691   match(Set dst (MaxReductionV dst src));
4692   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4693   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4694   ins_encode %{
4695     assert(UseAVX > 0, "sanity");
4696 
4697     int opcode = this->ideal_Opcode();
4698     int vlen = Matcher::vector_length(this, $src);
4699     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4700                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4701   %}
4702   ins_pipe( pipe_slow );
4703 %}
4704 
4705 
4706 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
4707                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4708   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4709             Matcher::vector_length(n->in(2)) >= 4);
4710   match(Set dst (MinReductionV dst src));
4711   match(Set dst (MaxReductionV dst src));
4712   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4713   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4714   ins_encode %{
4715     assert(UseAVX > 0, "sanity");
4716 
4717     int opcode = this->ideal_Opcode();
4718     int vlen = Matcher::vector_length(this, $src);
4719     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4720                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4721   %}
4722   ins_pipe( pipe_slow );
4723 %}
4724 
4725 
4726 //--------------------Min/Max Double Reduction --------------------
4727 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
4728                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4729                             rFlagsReg cr) %{
4730   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4731             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4732              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4733             Matcher::vector_length(n->in(2)) == 2);
4734   match(Set dst (MinReductionV src1 src2));
4735   match(Set dst (MaxReductionV src1 src2));
4736   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4737   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4738   ins_encode %{
4739     assert(UseAVX > 0, "sanity");
4740 
4741     int opcode = this->ideal_Opcode();
4742     int vlen = Matcher::vector_length(this, $src2);
4743     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
4744                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
4745   %}
4746   ins_pipe( pipe_slow );
4747 %}
4748 
4749 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
4750                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
4751                            rFlagsReg cr) %{
4752   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4753             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4754              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4755             Matcher::vector_length(n->in(2)) >= 4);
4756   match(Set dst (MinReductionV src1 src2));
4757   match(Set dst (MaxReductionV src1 src2));
4758   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
4759   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
4760   ins_encode %{
4761     assert(UseAVX > 0, "sanity");
4762 
4763     int opcode = this->ideal_Opcode();
4764     int vlen = Matcher::vector_length(this, $src2);
4765     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
4766                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
4767   %}
4768   ins_pipe( pipe_slow );
4769 %}
4770 
4771 
4772 instruct minmax_reduction2D_av(legRegD dst, legVec src,
4773                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4774                                rFlagsReg cr) %{
4775   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4776             Matcher::vector_length(n->in(2)) == 2);
4777   match(Set dst (MinReductionV dst src));
4778   match(Set dst (MaxReductionV dst src));
4779   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4780   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4781   ins_encode %{
4782     assert(UseAVX > 0, "sanity");
4783 
4784     int opcode = this->ideal_Opcode();
4785     int vlen = Matcher::vector_length(this, $src);
4786     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
4787                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
4788   %}
4789   ins_pipe( pipe_slow );
4790 %}
4791 
4792 instruct minmax_reductionD_av(legRegD dst, legVec src,
4793                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
4794                               rFlagsReg cr) %{
4795   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4796             Matcher::vector_length(n->in(2)) >= 4);
4797   match(Set dst (MinReductionV dst src));
4798   match(Set dst (MaxReductionV dst src));
4799   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
4800   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
4801   ins_encode %{
4802     assert(UseAVX > 0, "sanity");
4803 
4804     int opcode = this->ideal_Opcode();
4805     int vlen = Matcher::vector_length(this, $src);
4806     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
4807                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
4808   %}
4809   ins_pipe( pipe_slow );
4810 %}
4811 
4812 // ====================VECTOR ARITHMETIC=======================================
4813 
4814 // --------------------------------- ADD --------------------------------------
4815 
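     // Each element type gets three rules: a two-operand SSE form (UseAVX == 0) that
     // adds in place, a three-operand AVX register form, and an AVX form with a
     // folded memory operand (limited to vectors larger than 8 bytes).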
4816 // Bytes vector add
4817 instruct vaddB(vec dst, vec src) %{
4818   predicate(UseAVX == 0);
4819   match(Set dst (AddVB dst src));
4820   format %{ "paddb   $dst,$src\t! add packedB" %}
4821   ins_encode %{
4822     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
4823   %}
4824   ins_pipe( pipe_slow );
4825 %}
4826 
4827 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
4828   predicate(UseAVX > 0);
4829   match(Set dst (AddVB src1 src2));
4830   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
4831   ins_encode %{
4832     int vlen_enc = vector_length_encoding(this);
4833     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4834   %}
4835   ins_pipe( pipe_slow );
4836 %}
4837 
4838 instruct vaddB_mem(vec dst, vec src, memory mem) %{
4839   predicate((UseAVX > 0) &&
4840             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4841   match(Set dst (AddVB src (LoadVector mem)));
4842   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
4843   ins_encode %{
4844     int vlen_enc = vector_length_encoding(this);
4845     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4846   %}
4847   ins_pipe( pipe_slow );
4848 %}
4849 
4850 // Shorts/Chars vector add
4851 instruct vaddS(vec dst, vec src) %{
4852   predicate(UseAVX == 0);
4853   match(Set dst (AddVS dst src));
4854   format %{ "paddw   $dst,$src\t! add packedS" %}
4855   ins_encode %{
4856     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
4857   %}
4858   ins_pipe( pipe_slow );
4859 %}
4860 
4861 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
4862   predicate(UseAVX > 0);
4863   match(Set dst (AddVS src1 src2));
4864   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
4865   ins_encode %{
4866     int vlen_enc = vector_length_encoding(this);
4867     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4868   %}
4869   ins_pipe( pipe_slow );
4870 %}
4871 
4872 instruct vaddS_mem(vec dst, vec src, memory mem) %{
4873   predicate((UseAVX > 0) &&
4874             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4875   match(Set dst (AddVS src (LoadVector mem)));
4876   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
4877   ins_encode %{
4878     int vlen_enc = vector_length_encoding(this);
4879     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4880   %}
4881   ins_pipe( pipe_slow );
4882 %}
4883 
4884 // Integers vector add
4885 instruct vaddI(vec dst, vec src) %{
4886   predicate(UseAVX == 0);
4887   match(Set dst (AddVI dst src));
4888   format %{ "paddd   $dst,$src\t! add packedI" %}
4889   ins_encode %{
4890     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
4891   %}
4892   ins_pipe( pipe_slow );
4893 %}
4894 
4895 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
4896   predicate(UseAVX > 0);
4897   match(Set dst (AddVI src1 src2));
4898   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
4899   ins_encode %{
4900     int vlen_enc = vector_length_encoding(this);
4901     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4902   %}
4903   ins_pipe( pipe_slow );
4904 %}
4905 
4906 
4907 instruct vaddI_mem(vec dst, vec src, memory mem) %{
4908   predicate((UseAVX > 0) &&
4909             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4910   match(Set dst (AddVI src (LoadVector mem)));
4911   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
4912   ins_encode %{
4913     int vlen_enc = vector_length_encoding(this);
4914     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4915   %}
4916   ins_pipe( pipe_slow );
4917 %}
4918 
4919 // Longs vector add
4920 instruct vaddL(vec dst, vec src) %{
4921   predicate(UseAVX == 0);
4922   match(Set dst (AddVL dst src));
4923   format %{ "paddq   $dst,$src\t! add packedL" %}
4924   ins_encode %{
4925     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
4926   %}
4927   ins_pipe( pipe_slow );
4928 %}
4929 
4930 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
4931   predicate(UseAVX > 0);
4932   match(Set dst (AddVL src1 src2));
4933   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
4934   ins_encode %{
4935     int vlen_enc = vector_length_encoding(this);
4936     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4937   %}
4938   ins_pipe( pipe_slow );
4939 %}
4940 
4941 instruct vaddL_mem(vec dst, vec src, memory mem) %{
4942   predicate((UseAVX > 0) &&
4943             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4944   match(Set dst (AddVL src (LoadVector mem)));
4945   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
4946   ins_encode %{
4947     int vlen_enc = vector_length_encoding(this);
4948     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4949   %}
4950   ins_pipe( pipe_slow );
4951 %}
4952 
4953 // Floats vector add
4954 instruct vaddF(vec dst, vec src) %{
4955   predicate(UseAVX == 0);
4956   match(Set dst (AddVF dst src));
4957   format %{ "addps   $dst,$src\t! add packedF" %}
4958   ins_encode %{
4959     __ addps($dst$$XMMRegister, $src$$XMMRegister);
4960   %}
4961   ins_pipe( pipe_slow );
4962 %}
4963 
4964 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
4965   predicate(UseAVX > 0);
4966   match(Set dst (AddVF src1 src2));
4967   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
4968   ins_encode %{
4969     int vlen_enc = vector_length_encoding(this);
4970     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4971   %}
4972   ins_pipe( pipe_slow );
4973 %}
4974 
4975 instruct vaddF_mem(vec dst, vec src, memory mem) %{
4976   predicate((UseAVX > 0) &&
4977             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4978   match(Set dst (AddVF src (LoadVector mem)));
4979   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
4980   ins_encode %{
4981     int vlen_enc = vector_length_encoding(this);
4982     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4983   %}
4984   ins_pipe( pipe_slow );
4985 %}
4986 
4987 // Doubles vector add
4988 instruct vaddD(vec dst, vec src) %{
4989   predicate(UseAVX == 0);
4990   match(Set dst (AddVD dst src));
4991   format %{ "addpd   $dst,$src\t! add packedD" %}
4992   ins_encode %{
4993     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
4994   %}
4995   ins_pipe( pipe_slow );
4996 %}
4997 
4998 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
4999   predicate(UseAVX > 0);
5000   match(Set dst (AddVD src1 src2));
5001   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
5002   ins_encode %{
5003     int vlen_enc = vector_length_encoding(this);
5004     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5005   %}
5006   ins_pipe( pipe_slow );
5007 %}
5008 
5009 instruct vaddD_mem(vec dst, vec src, memory mem) %{
5010   predicate((UseAVX > 0) &&
5011             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5012   match(Set dst (AddVD src (LoadVector mem)));
5013   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
5014   ins_encode %{
5015     int vlen_enc = vector_length_encoding(this);
5016     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5017   %}
5018   ins_pipe( pipe_slow );
5019 %}
5020 
5021 // --------------------------------- SUB --------------------------------------
5022 
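     // Subtraction mirrors the three-rule pattern used for ADD above.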
5023 // Bytes vector sub
5024 instruct vsubB(vec dst, vec src) %{
5025   predicate(UseAVX == 0);
5026   match(Set dst (SubVB dst src));
5027   format %{ "psubb   $dst,$src\t! sub packedB" %}
5028   ins_encode %{
5029     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5030   %}
5031   ins_pipe( pipe_slow );
5032 %}
5033 
5034 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
5035   predicate(UseAVX > 0);
5036   match(Set dst (SubVB src1 src2));
5037   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
5038   ins_encode %{
5039     int vlen_enc = vector_length_encoding(this);
5040     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5041   %}
5042   ins_pipe( pipe_slow );
5043 %}
5044 
5045 instruct vsubB_mem(vec dst, vec src, memory mem) %{
5046   predicate((UseAVX > 0) &&
5047             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5048   match(Set dst (SubVB src (LoadVector mem)));
5049   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
5050   ins_encode %{
5051     int vlen_enc = vector_length_encoding(this);
5052     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5053   %}
5054   ins_pipe( pipe_slow );
5055 %}
5056 
5057 // Shorts/Chars vector sub
5058 instruct vsubS(vec dst, vec src) %{
5059   predicate(UseAVX == 0);
5060   match(Set dst (SubVS dst src));
5061   format %{ "psubw   $dst,$src\t! sub packedS" %}
5062   ins_encode %{
5063     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5064   %}
5065   ins_pipe( pipe_slow );
5066 %}
5067 
5068 
5069 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
5070   predicate(UseAVX > 0);
5071   match(Set dst (SubVS src1 src2));
5072   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
5073   ins_encode %{
5074     int vlen_enc = vector_length_encoding(this);
5075     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5076   %}
5077   ins_pipe( pipe_slow );
5078 %}
5079 
5080 instruct vsubS_mem(vec dst, vec src, memory mem) %{
5081   predicate((UseAVX > 0) &&
5082             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5083   match(Set dst (SubVS src (LoadVector mem)));
5084   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
5085   ins_encode %{
5086     int vlen_enc = vector_length_encoding(this);
5087     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5088   %}
5089   ins_pipe( pipe_slow );
5090 %}
5091 
5092 // Integers vector sub
5093 instruct vsubI(vec dst, vec src) %{
5094   predicate(UseAVX == 0);
5095   match(Set dst (SubVI dst src));
5096   format %{ "psubd   $dst,$src\t! sub packedI" %}
5097   ins_encode %{
5098     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
5099   %}
5100   ins_pipe( pipe_slow );
5101 %}
5102 
5103 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
5104   predicate(UseAVX > 0);
5105   match(Set dst (SubVI src1 src2));
5106   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
5107   ins_encode %{
5108     int vlen_enc = vector_length_encoding(this);
5109     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5110   %}
5111   ins_pipe( pipe_slow );
5112 %}
5113 
5114 instruct vsubI_mem(vec dst, vec src, memory mem) %{
5115   predicate((UseAVX > 0) &&
5116             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5117   match(Set dst (SubVI src (LoadVector mem)));
5118   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
5119   ins_encode %{
5120     int vlen_enc = vector_length_encoding(this);
5121     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5122   %}
5123   ins_pipe( pipe_slow );
5124 %}
5125 
5126 // Longs vector sub
5127 instruct vsubL(vec dst, vec src) %{
5128   predicate(UseAVX == 0);
5129   match(Set dst (SubVL dst src));
5130   format %{ "psubq   $dst,$src\t! sub packedL" %}
5131   ins_encode %{
5132     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
5133   %}
5134   ins_pipe( pipe_slow );
5135 %}
5136 
5137 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
5138   predicate(UseAVX > 0);
5139   match(Set dst (SubVL src1 src2));
5140   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
5141   ins_encode %{
5142     int vlen_enc = vector_length_encoding(this);
5143     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5144   %}
5145   ins_pipe( pipe_slow );
5146 %}
5147 
5148 
5149 instruct vsubL_mem(vec dst, vec src, memory mem) %{
5150   predicate((UseAVX > 0) &&
5151             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5152   match(Set dst (SubVL src (LoadVector mem)));
5153   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
5154   ins_encode %{
5155     int vlen_enc = vector_length_encoding(this);
5156     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5157   %}
5158   ins_pipe( pipe_slow );
5159 %}
5160 
5161 // Floats vector sub
5162 instruct vsubF(vec dst, vec src) %{
5163   predicate(UseAVX == 0);
5164   match(Set dst (SubVF dst src));
5165   format %{ "subps   $dst,$src\t! sub packedF" %}
5166   ins_encode %{
5167     __ subps($dst$$XMMRegister, $src$$XMMRegister);
5168   %}
5169   ins_pipe( pipe_slow );
5170 %}
5171 
5172 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
5173   predicate(UseAVX > 0);
5174   match(Set dst (SubVF src1 src2));
5175   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
5176   ins_encode %{
5177     int vlen_enc = vector_length_encoding(this);
5178     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5179   %}
5180   ins_pipe( pipe_slow );
5181 %}
5182 
5183 instruct vsubF_mem(vec dst, vec src, memory mem) %{
5184   predicate((UseAVX > 0) &&
5185             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5186   match(Set dst (SubVF src (LoadVector mem)));
5187   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
5188   ins_encode %{
5189     int vlen_enc = vector_length_encoding(this);
5190     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5191   %}
5192   ins_pipe( pipe_slow );
5193 %}
5194 
5195 // Doubles vector sub
5196 instruct vsubD(vec dst, vec src) %{
5197   predicate(UseAVX == 0);
5198   match(Set dst (SubVD dst src));
5199   format %{ "subpd   $dst,$src\t! sub packedD" %}
5200   ins_encode %{
5201     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
5202   %}
5203   ins_pipe( pipe_slow );
5204 %}
5205 
5206 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
5207   predicate(UseAVX > 0);
5208   match(Set dst (SubVD src1 src2));
5209   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
5210   ins_encode %{
5211     int vlen_enc = vector_length_encoding(this);
5212     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5213   %}
5214   ins_pipe( pipe_slow );
5215 %}
5216 
5217 instruct vsubD_mem(vec dst, vec src, memory mem) %{
5218   predicate((UseAVX > 0) &&
5219             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5220   match(Set dst (SubVD src (LoadVector mem)));
5221   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
5222   ins_encode %{
5223     int vlen_enc = vector_length_encoding(this);
5224     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5225   %}
5226   ins_pipe( pipe_slow );
5227 %}
5228 
5229 // --------------------------------- MUL --------------------------------------
5230 
5231 // Byte vector mul
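// There is no packed byte multiply on x86, so the rules below widen bytes to
// shorts (pmovsxbw), multiply with pmullw, keep the low byte of each 16-bit
// product via vector_short_to_byte_mask, and repack with packuswb. A minimal
// scalar sketch of the per-lane math (illustrative only, not part of the stub):
//
//   int8_t mul_byte_lane(int8_t a, int8_t b) {
//     int16_t wide = (int16_t)a * (int16_t)b;   // pmovsxbw + pmullw
//     return (int8_t)(wide & 0xFF);             // mask + packuswb
//   }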
5232 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5233   predicate(Matcher::vector_length(n) == 4 ||
5234             Matcher::vector_length(n) == 8);
5235   match(Set dst (MulVB src1 src2));
5236   effect(TEMP dst, TEMP tmp, TEMP scratch);
5237   format %{"vector_mulB $dst,$src1,$src2" %}
5238   ins_encode %{
5239     assert(UseSSE > 3, "required");
5240     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
5241     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
5242     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
5243     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5244     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5245     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5246   %}
5247   ins_pipe( pipe_slow );
5248 %}
5249 
5250 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5251   predicate(Matcher::vector_length(n) == 16 && UseAVX <= 1);
5252   match(Set dst (MulVB src1 src2));
5253   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5254   format %{"vector_mulB $dst,$src1,$src2" %}
5255   ins_encode %{
5256     assert(UseSSE > 3, "required");
5257     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
5258     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
5259     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
5260     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
5261     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
5262     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5263     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
5264     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
5265     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5266     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5267     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5268     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5269   %}
5270   ins_pipe( pipe_slow );
5271 %}
5272 
5273 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5274   predicate(Matcher::vector_length(n) == 16 && UseAVX > 1);
5275   match(Set dst (MulVB src1 src2));
5276   effect(TEMP dst, TEMP tmp, TEMP scratch);
5277   format %{"vector_mulB $dst,$src1,$src2" %}
5278   ins_encode %{
    int vlen_enc = Assembler::AVX_256bit;
5280     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5281     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5282     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5283     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5284     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5285     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
5286     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
5287   %}
5288   ins_pipe( pipe_slow );
5289 %}
5290 
5291 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5292   predicate(Matcher::vector_length(n) == 32);
5293   match(Set dst (MulVB src1 src2));
5294   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5295   format %{"vector_mulB $dst,$src1,$src2" %}
5296   ins_encode %{
5297     assert(UseAVX > 1, "required");
5298     int vlen_enc = Assembler::AVX_256bit;
5299     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5300     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
5301     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5302     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5303     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5304     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5305     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5306     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5307     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5308     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5309     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5310     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5311     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5312     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5313   %}
5314   ins_pipe( pipe_slow );
5315 %}
5316 
5317 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5318   predicate(Matcher::vector_length(n) == 64);
5319   match(Set dst (MulVB src1 src2));
5320   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5321   format %{"vector_mulB $dst,$src1,$src2\n\t" %}
5322   ins_encode %{
5323     assert(UseAVX > 2, "required");
5324     int vlen_enc = Assembler::AVX_512bit;
5325     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5326     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
5327     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5328     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5329     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5330     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5331     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5332     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5333     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5334     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5335     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5336     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5337     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5338     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
5339     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5340   %}
5341   ins_pipe( pipe_slow );
5342 %}
5343 
5344 // Shorts/Chars vector mul
5345 instruct vmulS(vec dst, vec src) %{
5346   predicate(UseAVX == 0);
5347   match(Set dst (MulVS dst src));
5348   format %{ "pmullw $dst,$src\t! mul packedS" %}
5349   ins_encode %{
5350     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
5351   %}
5352   ins_pipe( pipe_slow );
5353 %}
5354 
5355 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
5356   predicate(UseAVX > 0);
5357   match(Set dst (MulVS src1 src2));
5358   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
5359   ins_encode %{
5360     int vlen_enc = vector_length_encoding(this);
5361     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5362   %}
5363   ins_pipe( pipe_slow );
5364 %}
5365 
5366 instruct vmulS_mem(vec dst, vec src, memory mem) %{
5367   predicate((UseAVX > 0) &&
5368             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5369   match(Set dst (MulVS src (LoadVector mem)));
5370   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
5371   ins_encode %{
5372     int vlen_enc = vector_length_encoding(this);
5373     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5374   %}
5375   ins_pipe( pipe_slow );
5376 %}
5377 
5378 // Integers vector mul
5379 instruct vmulI(vec dst, vec src) %{
5380   predicate(UseAVX == 0);
5381   match(Set dst (MulVI dst src));
5382   format %{ "pmulld  $dst,$src\t! mul packedI" %}
5383   ins_encode %{
5384     assert(UseSSE > 3, "required");
5385     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
5386   %}
5387   ins_pipe( pipe_slow );
5388 %}
5389 
5390 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
5391   predicate(UseAVX > 0);
5392   match(Set dst (MulVI src1 src2));
5393   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
5394   ins_encode %{
5395     int vlen_enc = vector_length_encoding(this);
5396     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5397   %}
5398   ins_pipe( pipe_slow );
5399 %}
5400 
5401 instruct vmulI_mem(vec dst, vec src, memory mem) %{
5402   predicate((UseAVX > 0) &&
5403             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5404   match(Set dst (MulVI src (LoadVector mem)));
5405   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
5406   ins_encode %{
5407     int vlen_enc = vector_length_encoding(this);
5408     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5409   %}
5410   ins_pipe( pipe_slow );
5411 %}
5412 
5413 // Longs vector mul
5414 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
5415   predicate(VM_Version::supports_avx512dq());
5416   match(Set dst (MulVL src1 src2));
5417   format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
5418   ins_encode %{
5419     assert(UseAVX > 2, "required");
5420     int vlen_enc = vector_length_encoding(this);
5421     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5422   %}
5423   ins_pipe( pipe_slow );
5424 %}
5425 
5426 instruct vmulL_mem(vec dst, vec src, memory mem) %{
5427   predicate(VM_Version::supports_avx512dq() &&
5428               (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5429   match(Set dst (MulVL src (LoadVector mem)));
5430   format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
5431   ins_encode %{
5432     assert(UseAVX > 2, "required");
5433     int vlen_enc = vector_length_encoding(this);
5434     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5435   %}
5436   ins_pipe( pipe_slow );
5437 %}
5438 
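// Without AVX-512DQ there is no packed 64x64->64 multiply, so the rules below
// assemble it from 32-bit pieces: with a = aH:aL and b = bH:bL,
// a*b mod 2^64 = aL*bL + ((aL*bH + aH*bL) << 32). A scalar sketch of the same
// decomposition (hypothetical helper, for illustration only):
//
//   uint64_t mul_long_lane(uint64_t a, uint64_t b) {
//     uint32_t al = (uint32_t)a, ah = (uint32_t)(a >> 32);
//     uint32_t bl = (uint32_t)b, bh = (uint32_t)(b >> 32);
//     uint32_t cross = al * bh + ah * bl;        // pshufd + pmulld + phaddd
//     return (uint64_t)al * bl                   // pmuludq
//            + ((uint64_t)cross << 32);          // pmovzxdq + psllq + paddq
//   }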
5439 instruct mul2L_reg(vec dst, vec src2, legVec tmp) %{
5440   predicate(Matcher::vector_length(n) == 2 && !VM_Version::supports_avx512dq());
5441   match(Set dst (MulVL dst src2));
5442   effect(TEMP dst, TEMP tmp);
5443   format %{ "pshufd $tmp,$src2, 177\n\t"
5444             "pmulld $tmp,$dst\n\t"
5445             "phaddd $tmp,$tmp\n\t"
5446             "pmovzxdq $tmp,$tmp\n\t"
5447             "psllq $tmp, 32\n\t"
5448             "pmuludq $dst,$src2\n\t"
5449             "paddq $dst,$tmp\n\t! mul packed2L" %}
5450 
5451   ins_encode %{
5452     assert(VM_Version::supports_sse4_1(), "required");
5454     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
5455     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
5456     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5457     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
5458     __ psllq($tmp$$XMMRegister, 32);
5459     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
5460     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
5461   %}
5462   ins_pipe( pipe_slow );
5463 %}
5464 
5465 instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, legVec tmp, legVec tmp1) %{
5466   predicate(Matcher::vector_length(n) == 4 && !VM_Version::supports_avx512dq());
5467   match(Set dst (MulVL src1 src2));
5468   effect(TEMP tmp1, TEMP tmp);
  format %{ "vpshufd $tmp,$src2, 177\n\t"
            "vpmulld $tmp,$src1,$tmp\n\t"
            "vextracti128_high $tmp1,$tmp\n\t"
            "vphaddd $tmp,$tmp,$tmp1\n\t"
            "vpmovzxdq $tmp,$tmp\n\t"
            "vpsllq $tmp,$tmp, 32\n\t"
            "vpmuludq $tmp1,$src1,$src2\n\t"
            "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
5476   ins_encode %{
5477     int vlen_enc = Assembler::AVX_256bit;
5478     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
5479     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5480     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
5481     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5482     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5483     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
5484     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5485     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5486   %}
5487   ins_pipe( pipe_slow );
5488 %}
5489 
5490 // Floats vector mul
5491 instruct vmulF(vec dst, vec src) %{
5492   predicate(UseAVX == 0);
5493   match(Set dst (MulVF dst src));
5494   format %{ "mulps   $dst,$src\t! mul packedF" %}
5495   ins_encode %{
5496     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
5497   %}
5498   ins_pipe( pipe_slow );
5499 %}
5500 
5501 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
5502   predicate(UseAVX > 0);
5503   match(Set dst (MulVF src1 src2));
5504   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
5505   ins_encode %{
5506     int vlen_enc = vector_length_encoding(this);
5507     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5508   %}
5509   ins_pipe( pipe_slow );
5510 %}
5511 
5512 instruct vmulF_mem(vec dst, vec src, memory mem) %{
5513   predicate((UseAVX > 0) &&
5514             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5515   match(Set dst (MulVF src (LoadVector mem)));
5516   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
5517   ins_encode %{
5518     int vlen_enc = vector_length_encoding(this);
5519     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5520   %}
5521   ins_pipe( pipe_slow );
5522 %}
5523 
5524 // Doubles vector mul
5525 instruct vmulD(vec dst, vec src) %{
5526   predicate(UseAVX == 0);
5527   match(Set dst (MulVD dst src));
5528   format %{ "mulpd   $dst,$src\t! mul packedD" %}
5529   ins_encode %{
5530     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
5531   %}
5532   ins_pipe( pipe_slow );
5533 %}
5534 
5535 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
5536   predicate(UseAVX > 0);
5537   match(Set dst (MulVD src1 src2));
5538   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
5539   ins_encode %{
5540     int vlen_enc = vector_length_encoding(this);
5541     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5542   %}
5543   ins_pipe( pipe_slow );
5544 %}
5545 
5546 instruct vmulD_mem(vec dst, vec src, memory mem) %{
5547   predicate((UseAVX > 0) &&
5548             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5549   match(Set dst (MulVD src (LoadVector mem)));
5550   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
5551   ins_encode %{
5552     int vlen_enc = vector_length_encoding(this);
5553     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5554   %}
5555   ins_pipe( pipe_slow );
5556 %}
5557 
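// Vector conditional move is lowered to a vector compare that leaves an
// all-ones/all-zeros mask per lane in dst, followed by a variable blend that
// picks src2 where the mask is set and src1 elsewhere, i.e. per lane:
// dst[i] = cond[i] ? src2[i] : src1[i].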
5558 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5559   predicate(Matcher::vector_length(n) == 8);
5560   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
5561   effect(TEMP dst, USE src1, USE src2);
  format %{ "vcmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
            "vblendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
5564          %}
5565   ins_encode %{
5566     assert(UseAVX > 0, "required");
5567 
5568     int vlen_enc = Assembler::AVX_256bit;
5569     int cond = (Assembler::Condition)($copnd$$cmpcode);
5570     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5571     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5572   %}
5573   ins_pipe( pipe_slow );
5574 %}
5575 
5576 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5577   predicate(Matcher::vector_length(n) == 4);
5578   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
5579   effect(TEMP dst, USE src1, USE src2);
  format %{ "vcmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
5581             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
5582          %}
5583   ins_encode %{
5584     assert(UseAVX > 0, "required");
5585 
5586     int vlen_enc = Assembler::AVX_256bit;
5587     int cond = (Assembler::Condition)($copnd$$cmpcode);
5588     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5589     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5590   %}
5591   ins_pipe( pipe_slow );
5592 %}
5593 
5594 // --------------------------------- DIV --------------------------------------
5595 
5596 // Floats vector div
5597 instruct vdivF(vec dst, vec src) %{
5598   predicate(UseAVX == 0);
5599   match(Set dst (DivVF dst src));
5600   format %{ "divps   $dst,$src\t! div packedF" %}
5601   ins_encode %{
5602     __ divps($dst$$XMMRegister, $src$$XMMRegister);
5603   %}
5604   ins_pipe( pipe_slow );
5605 %}
5606 
5607 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
5608   predicate(UseAVX > 0);
5609   match(Set dst (DivVF src1 src2));
5610   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
5611   ins_encode %{
5612     int vlen_enc = vector_length_encoding(this);
5613     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5614   %}
5615   ins_pipe( pipe_slow );
5616 %}
5617 
5618 instruct vdivF_mem(vec dst, vec src, memory mem) %{
5619   predicate((UseAVX > 0) &&
5620             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5621   match(Set dst (DivVF src (LoadVector mem)));
5622   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
5623   ins_encode %{
5624     int vlen_enc = vector_length_encoding(this);
5625     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5626   %}
5627   ins_pipe( pipe_slow );
5628 %}
5629 
5630 // Doubles vector div
5631 instruct vdivD(vec dst, vec src) %{
5632   predicate(UseAVX == 0);
5633   match(Set dst (DivVD dst src));
5634   format %{ "divpd   $dst,$src\t! div packedD" %}
5635   ins_encode %{
5636     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
5637   %}
5638   ins_pipe( pipe_slow );
5639 %}
5640 
5641 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
5642   predicate(UseAVX > 0);
5643   match(Set dst (DivVD src1 src2));
5644   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
5645   ins_encode %{
5646     int vlen_enc = vector_length_encoding(this);
5647     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5648   %}
5649   ins_pipe( pipe_slow );
5650 %}
5651 
5652 instruct vdivD_mem(vec dst, vec src, memory mem) %{
5653   predicate((UseAVX > 0) &&
5654             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5655   match(Set dst (DivVD src (LoadVector mem)));
5656   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
5657   ins_encode %{
5658     int vlen_enc = vector_length_encoding(this);
5659     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5660   %}
5661   ins_pipe( pipe_slow );
5662 %}
5663 
5664 // ------------------------------ MinMax ---------------------------------------
5665 
5666 // Byte, Short, Int vector Min/Max
5667 instruct minmax_reg_sse(vec dst, vec src) %{
5668   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5669             UseAVX == 0);
5670   match(Set dst (MinV dst src));
5671   match(Set dst (MaxV dst src));
5672   format %{ "vector_minmax  $dst,$src\t!  " %}
5673   ins_encode %{
5674     assert(UseSSE >= 4, "required");
5675 
5676     int opcode = this->ideal_Opcode();
5677     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5678     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
5679   %}
5680   ins_pipe( pipe_slow );
5681 %}
5682 
5683 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
5684   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5685             UseAVX > 0);
5686   match(Set dst (MinV src1 src2));
5687   match(Set dst (MaxV src1 src2));
5688   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
5689   ins_encode %{
5690     int opcode = this->ideal_Opcode();
5691     int vlen_enc = vector_length_encoding(this);
5692     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5693 
5694     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5695   %}
5696   ins_pipe( pipe_slow );
5697 %}
5698 
5699 // Long vector Min/Max
5700 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
5701   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
5702             UseAVX == 0);
5703   match(Set dst (MinV dst src));
  match(Set dst (MaxV dst src));
5705   effect(TEMP dst, TEMP tmp);
5706   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
5707   ins_encode %{
5708     assert(UseSSE >= 4, "required");
5709 
5710     int opcode = this->ideal_Opcode();
5711     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5712     assert(elem_bt == T_LONG, "sanity");
5713 
5714     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
5715   %}
5716   ins_pipe( pipe_slow );
5717 %}
5718 
5719 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
5720   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
5721             UseAVX > 0 && !VM_Version::supports_avx512vl());
5722   match(Set dst (MinV src1 src2));
5723   match(Set dst (MaxV src1 src2));
5724   effect(TEMP dst);
5725   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
5726   ins_encode %{
5727     int vlen_enc = vector_length_encoding(this);
5728     int opcode = this->ideal_Opcode();
5729     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5730     assert(elem_bt == T_LONG, "sanity");
5731 
5732     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5733   %}
5734   ins_pipe( pipe_slow );
5735 %}
5736 
5737 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
5738   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
5739             Matcher::vector_element_basic_type(n) == T_LONG);
5740   match(Set dst (MinV src1 src2));
5741   match(Set dst (MaxV src1 src2));
  format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
5743   ins_encode %{
5744     assert(UseAVX > 2, "required");
5745 
5746     int vlen_enc = vector_length_encoding(this);
5747     int opcode = this->ideal_Opcode();
5748     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5749     assert(elem_bt == T_LONG, "sanity");
5750 
5751     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5752   %}
5753   ins_pipe( pipe_slow );
5754 %}
5755 
5756 // Float/Double vector Min/Max
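// Note: Java's Math.min/max semantics (NaN propagation, and -0.0 ordered below
// +0.0, e.g. Math.max(0.0f, -0.0f) == +0.0f and Math.max(x, Float.NaN) is NaN)
// are not what raw (v)minps/maxps/minpd/maxpd deliver on their own, which is why
// the rules below blend through the extra TEMP vectors (and an opmask register
// on EVEX) rather than emitting a single min/max instruction.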
5757 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
5758   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
5759             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
5760             UseAVX > 0);
5761   match(Set dst (MinV a b));
5762   match(Set dst (MaxV a b));
5763   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
5764   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
5765   ins_encode %{
5766     assert(UseAVX > 0, "required");
5767 
5768     int opcode = this->ideal_Opcode();
5769     int vlen_enc = vector_length_encoding(this);
5770     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5771 
5772     __ vminmax_fp(opcode, elem_bt,
5773                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
                  $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
5775   %}
5776   ins_pipe( pipe_slow );
5777 %}
5778 
5779 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
5780   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
5781             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
5782   match(Set dst (MinV a b));
5783   match(Set dst (MaxV a b));
5784   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
5785   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
5786   ins_encode %{
5787     assert(UseAVX > 2, "required");
5788 
5789     int opcode = this->ideal_Opcode();
5790     int vlen_enc = vector_length_encoding(this);
5791     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5792 
5793     __ evminmax_fp(opcode, elem_bt,
5794                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
                   $ktmp$$KRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
5796   %}
5797   ins_pipe( pipe_slow );
5798 %}
5799 
5800 // --------------------------------- Signum/CopySign ---------------------------
5801 
5802 instruct signumF_reg(regF dst, regF zero, regF one, rRegP scratch, rFlagsReg cr) %{
5803   match(Set dst (SignumF dst (Binary zero one)));
5804   effect(TEMP scratch, KILL cr);
5805   format %{ "signumF $dst, $dst\t! using $scratch as TEMP" %}
5806   ins_encode %{
5807     int opcode = this->ideal_Opcode();
5808     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
5809   %}
5810   ins_pipe( pipe_slow );
5811 %}
5812 
5813 instruct signumD_reg(regD dst, regD zero, regD one, rRegP scratch, rFlagsReg cr) %{
5814   match(Set dst (SignumD dst (Binary zero one)));
5815   effect(TEMP scratch, KILL cr);
5816   format %{ "signumD $dst, $dst\t! using $scratch as TEMP" %}
5817   ins_encode %{
5818     int opcode = this->ideal_Opcode();
5819     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
5820   %}
5821   ins_pipe( pipe_slow );
5822 %}
5823 
5824 // ---------------------------------------
5825 // For copySign use 0xE4 as writemask for vpternlog
5826 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
5827 // C (xmm2) is set to 0x7FFFFFFF
5828 // Wherever xmm2 is 0, we want to pick from B (sign)
5829 // Wherever xmm2 is 1, we want to pick from A (src)
5830 //
5831 // A B C Result
5832 // 0 0 0 0
5833 // 0 0 1 0
5834 // 0 1 0 1
5835 // 0 1 1 0
5836 // 1 0 0 0
5837 // 1 0 1 1
5838 // 1 1 0 1
5839 // 1 1 1 1
5840 //
// Result going from high bit to low bit is 0b11100100 = 0xE4
5842 // ---------------------------------------
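// A scalar sketch of the selection vpternlog performs with imm8 = 0xE4
// (illustrative helper only, not part of the stub):
//
//   uint32_t ternlog_0xE4(uint32_t a, uint32_t b, uint32_t c) {
//     uint32_t r = 0;
//     for (int i = 0; i < 32; i++) {
//       int idx = (((a >> i) & 1) << 2) | (((b >> i) & 1) << 1) | ((c >> i) & 1);
//       r |= (uint32_t)((0xE4 >> idx) & 1) << i;
//     }
//     return r;
//   }
//
// With c = 0x7FFFFFFF this yields (a & 0x7FFFFFFF) | (b & 0x80000000): the
// magnitude bits of the first operand combined with the sign bit of the second.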
5843 
5844 #ifdef _LP64
5845 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
5846   match(Set dst (CopySignF dst src));
5847   effect(TEMP tmp1, TEMP tmp2);
5848   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
5849   ins_encode %{
5850     __ movl($tmp2$$Register, 0x7FFFFFFF);
5851     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
5852     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
5853   %}
5854   ins_pipe( pipe_slow );
5855 %}
5856 
5857 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
5858   match(Set dst (CopySignD dst (Binary src zero)));
5859   ins_cost(100);
5860   effect(TEMP tmp1, TEMP tmp2);
5861   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
5862   ins_encode %{
5863     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
5864     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
5865     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
5866   %}
5867   ins_pipe( pipe_slow );
5868 %}
5869 #endif // _LP64
5870 
5871 // --------------------------------- Sqrt --------------------------------------
5872 
5873 instruct vsqrtF_reg(vec dst, vec src) %{
5874   match(Set dst (SqrtVF src));
5875   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
5876   ins_encode %{
5877     assert(UseAVX > 0, "required");
5878     int vlen_enc = vector_length_encoding(this);
5879     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5880   %}
5881   ins_pipe( pipe_slow );
5882 %}
5883 
5884 instruct vsqrtF_mem(vec dst, memory mem) %{
5885   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
5886   match(Set dst (SqrtVF (LoadVector mem)));
5887   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
5888   ins_encode %{
5889     assert(UseAVX > 0, "required");
5890     int vlen_enc = vector_length_encoding(this);
5891     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
5892   %}
5893   ins_pipe( pipe_slow );
5894 %}
5895 
// Doubles vector sqrt
5897 instruct vsqrtD_reg(vec dst, vec src) %{
5898   match(Set dst (SqrtVD src));
5899   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
5900   ins_encode %{
5901     assert(UseAVX > 0, "required");
5902     int vlen_enc = vector_length_encoding(this);
5903     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5904   %}
5905   ins_pipe( pipe_slow );
5906 %}
5907 
5908 instruct vsqrtD_mem(vec dst, memory mem) %{
5909   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
5910   match(Set dst (SqrtVD (LoadVector mem)));
5911   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
5912   ins_encode %{
5913     assert(UseAVX > 0, "required");
5914     int vlen_enc = vector_length_encoding(this);
5915     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
5916   %}
5917   ins_pipe( pipe_slow );
5918 %}
5919 
5920 // ------------------------------ Shift ---------------------------------------
5921 
// Left and right shift count vectors are the same on x86
// (only the low 64 bits of the xmm register are used as the count).
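// A minimal SSE2 intrinsics sketch of that count handling (C++ fragment,
// illustrative only):
//
//   #include <emmintrin.h>
//   __m128i cnt = _mm_set_epi64x(0, 3);   // count lives in the low 64 bits
//   __m128i v   = _mm_set1_epi16(1);
//   __m128i l   = _mm_sll_epi16(v, cnt);  // psllw: each lane becomes 1 << 3 = 8
//   __m128i r   = _mm_srl_epi16(l, cnt);  // psrlw: same count register layout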
5924 instruct vshiftcnt(vec dst, rRegI cnt) %{
5925   match(Set dst (LShiftCntV cnt));
5926   match(Set dst (RShiftCntV cnt));
5927   format %{ "movdl    $dst,$cnt\t! load shift count" %}
5928   ins_encode %{
5929     __ movdl($dst$$XMMRegister, $cnt$$Register);
5930   %}
5931   ins_pipe( pipe_slow );
5932 %}
5933 
5934 // Byte vector shift
5935 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5936   predicate(Matcher::vector_length(n) <= 8 && VectorNode::is_vshift_cnt(n->in(2)));
5937   match(Set dst ( LShiftVB src shift));
5938   match(Set dst ( RShiftVB src shift));
5939   match(Set dst (URShiftVB src shift));
5940   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
5941   format %{"vector_byte_shift $dst,$src,$shift" %}
5942   ins_encode %{
5943     assert(UseSSE > 3, "required");
5944     int opcode = this->ideal_Opcode();
5945     bool sign = (opcode != Op_URShiftVB);
5946     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
5947     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
5948     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5949     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5950     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5951   %}
5952   ins_pipe( pipe_slow );
5953 %}
5954 
5955 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
5956   predicate(Matcher::vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
5957             UseAVX <= 1);
5958   match(Set dst ( LShiftVB src shift));
5959   match(Set dst ( RShiftVB src shift));
5960   match(Set dst (URShiftVB src shift));
5961   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
5962   format %{"vector_byte_shift $dst,$src,$shift" %}
5963   ins_encode %{
5964     assert(UseSSE > 3, "required");
5965     int opcode = this->ideal_Opcode();
5966     bool sign = (opcode != Op_URShiftVB);
5967     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
5968     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
5969     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
5970     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
5971     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
5972     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5973     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5974     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5975     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5976   %}
5977   ins_pipe( pipe_slow );
5978 %}
5979 
5980 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5981   predicate(Matcher::vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
5982             UseAVX > 1);
5983   match(Set dst ( LShiftVB src shift));
5984   match(Set dst ( RShiftVB src shift));
5985   match(Set dst (URShiftVB src shift));
5986   effect(TEMP dst, TEMP tmp, TEMP scratch);
5987   format %{"vector_byte_shift $dst,$src,$shift" %}
5988   ins_encode %{
5989     int opcode = this->ideal_Opcode();
5990     bool sign = (opcode != Op_URShiftVB);
5991     int vlen_enc = Assembler::AVX_256bit;
5992     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
5993     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5994     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5995     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
5996     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
5997   %}
5998   ins_pipe( pipe_slow );
5999 %}
6000 
6001 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6002   predicate(Matcher::vector_length(n) == 32 && VectorNode::is_vshift_cnt(n->in(2)));
6003   match(Set dst ( LShiftVB src shift));
6004   match(Set dst ( RShiftVB src shift));
6005   match(Set dst (URShiftVB src shift));
6006   effect(TEMP dst, TEMP tmp, TEMP scratch);
6007   format %{"vector_byte_shift $dst,$src,$shift" %}
6008   ins_encode %{
6009     assert(UseAVX > 1, "required");
6010     int opcode = this->ideal_Opcode();
6011     bool sign = (opcode != Op_URShiftVB);
6012     int vlen_enc = Assembler::AVX_256bit;
6013     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
6014     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6015     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6016     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6017     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6018     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6019     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6020     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6021     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6022   %}
6023   ins_pipe( pipe_slow );
6024 %}
6025 
6026 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6027   predicate(Matcher::vector_length(n) == 64 && VectorNode::is_vshift_cnt(n->in(2)));
6028   match(Set dst ( LShiftVB src shift));
  match(Set dst ( RShiftVB src shift));
6030   match(Set dst (URShiftVB src shift));
6031   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6032   format %{"vector_byte_shift $dst,$src,$shift" %}
6033   ins_encode %{
6034     assert(UseAVX > 2, "required");
6035     int opcode = this->ideal_Opcode();
6036     bool sign = (opcode != Op_URShiftVB);
6037     int vlen_enc = Assembler::AVX_512bit;
6038     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6039     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
6040     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6041     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6042     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6043     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6044     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6045     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6046     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6047     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
6048     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
6049     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6050   %}
6051   ins_pipe( pipe_slow );
6052 %}
6053 
// Shorts vector logical right shift produces an incorrect Java result
// for negative data because Java code converts the short value into an int with
// sign extension before the shift. But char vectors are fine since chars are
// unsigned values.
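// A scalar sketch of the mismatch (illustrative only):
//
//   int16_t s = -1;                                            // 0xFFFF
//   int16_t via_int  = (int16_t)(((uint32_t)(int32_t)s) >> 2); // Java short >>> 2:
//                                                              // sign-extend, shift,
//                                                              // narrow -> -1
//   int16_t via_word = (int16_t)((uint16_t)s >> 2);            // psrlw lane -> 0x3FFF
//
// The two disagree for negative shorts, whereas a char (zero-extended) gives the
// same answer either way.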
// Shorts/Chars vector shift
6059 instruct vshiftS(vec dst, vec src, vec shift) %{
6060   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6061   match(Set dst ( LShiftVS src shift));
6062   match(Set dst ( RShiftVS src shift));
6063   match(Set dst (URShiftVS src shift));
6064   effect(TEMP dst, USE src, USE shift);
6065   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
6066   ins_encode %{
6067     int opcode = this->ideal_Opcode();
6068     if (UseAVX > 0) {
6069       int vlen_enc = vector_length_encoding(this);
6070       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6071     } else {
6072       int vlen = Matcher::vector_length(this);
6073       if (vlen == 2) {
6074         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6075         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6076       } else if (vlen == 4) {
6077         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6078         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6079       } else {
6080         assert (vlen == 8, "sanity");
6081         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6082         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6083       }
6084     }
6085   %}
6086   ins_pipe( pipe_slow );
6087 %}
6088 
// Integers vector shift
6090 instruct vshiftI(vec dst, vec src, vec shift) %{
6091   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6092   match(Set dst ( LShiftVI src shift));
6093   match(Set dst ( RShiftVI src shift));
6094   match(Set dst (URShiftVI src shift));
6095   effect(TEMP dst, USE src, USE shift);
6096   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
6097   ins_encode %{
6098     int opcode = this->ideal_Opcode();
6099     if (UseAVX > 0) {
6100       int vlen_enc = vector_length_encoding(this);
6101       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6102     } else {
6103       int vlen = Matcher::vector_length(this);
6104       if (vlen == 2) {
6105         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6106         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6107       } else {
6108         assert(vlen == 4, "sanity");
6109         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6110         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6111       }
6112     }
6113   %}
6114   ins_pipe( pipe_slow );
6115 %}
6116 
// Integers vector constant shift
6118 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
6119   match(Set dst (LShiftVI src (LShiftCntV shift)));
6120   match(Set dst (RShiftVI src (RShiftCntV shift)));
6121   match(Set dst (URShiftVI src (RShiftCntV shift)));
6122   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
6123   ins_encode %{
6124     int opcode = this->ideal_Opcode();
6125     if (UseAVX > 0) {
6126       int vector_len = vector_length_encoding(this);
6127       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6128     } else {
6129       int vlen = Matcher::vector_length(this);
6130       if (vlen == 2) {
6131         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6132         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6133       } else {
6134         assert(vlen == 4, "sanity");
6135         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6136         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6137       }
6138     }
6139   %}
6140   ins_pipe( pipe_slow );
6141 %}
6142 
6143 // Longs vector shift
6144 instruct vshiftL(vec dst, vec src, vec shift) %{
6145   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6146   match(Set dst ( LShiftVL src shift));
6147   match(Set dst (URShiftVL src shift));
6148   effect(TEMP dst, USE src, USE shift);
6149   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
6150   ins_encode %{
6151     int opcode = this->ideal_Opcode();
6152     if (UseAVX > 0) {
6153       int vlen_enc = vector_length_encoding(this);
6154       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6155     } else {
6156       assert(Matcher::vector_length(this) == 2, "");
6157       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6158       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6159     }
6160   %}
6161   ins_pipe( pipe_slow );
6162 %}
6163 
6164 // Longs vector constant shift
6165 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
6166   match(Set dst (LShiftVL src (LShiftCntV shift)));
6167   match(Set dst (URShiftVL src (RShiftCntV shift)));
6168   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
6169   ins_encode %{
6170     int opcode = this->ideal_Opcode();
6171     if (UseAVX > 0) {
6172       int vector_len = vector_length_encoding(this);
6173       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6174     } else {
6175       assert(Matcher::vector_length(this) == 2, "");
6176       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6177       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6178     }
6179   %}
6180   ins_pipe( pipe_slow );
6181 %}
6182 
6183 // -------------------ArithmeticRightShift -----------------------------------
6184 // Long vector arithmetic right shift
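// Before AVX-512 there is no packed arithmetic right shift for 64-bit lanes
// (evpsraq), so it is emulated with the identity sra(x, n) = ((x >>> n) ^ m) - m
// where m = 0x8000000000000000 >>> n. A scalar sketch (illustrative only):
//
//   uint64_t m = 0x8000000000000000ULL >> n;
//   uint64_t y = x >> n;                   // logical shift, as psrlq does
//   int64_t  r = (int64_t)((y ^ m) - m);   // equals ((int64_t)x) >> n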
6185 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6186   predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX <= 2);
6187   match(Set dst (RShiftVL src shift));
6188   effect(TEMP dst, TEMP tmp, TEMP scratch);
6189   format %{ "vshiftq $dst,$src,$shift" %}
6190   ins_encode %{
6191     uint vlen = Matcher::vector_length(this);
6192     if (vlen == 2) {
6193       assert(UseSSE >= 2, "required");
6194       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6195       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6196       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6197       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6198       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6199       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6200     } else {
6201       assert(vlen == 4, "sanity");
6202       assert(UseAVX > 1, "required");
6203       int vlen_enc = Assembler::AVX_256bit;
6204       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6205       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6206       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6207       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6208       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6209     }
6210   %}
6211   ins_pipe( pipe_slow );
6212 %}
6213 
6214 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6215   predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX > 2);
6216   match(Set dst (RShiftVL src shift));
6217   format %{ "vshiftq $dst,$src,$shift" %}
6218   ins_encode %{
6219     int vlen_enc = vector_length_encoding(this);
6220     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6221   %}
6222   ins_pipe( pipe_slow );
6223 %}
6224 
6225 // ------------------- Variable Shift -----------------------------
6226 // Byte variable shift
6227 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6228   predicate(Matcher::vector_length(n) <= 8 &&
6229             !VectorNode::is_vshift_cnt(n->in(2)) &&
6230             !VM_Version::supports_avx512bw());
6231   match(Set dst ( LShiftVB src shift));
6232   match(Set dst ( RShiftVB src shift));
6233   match(Set dst (URShiftVB src shift));
6234   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6235   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6236   ins_encode %{
6237     assert(UseAVX >= 2, "required");
6238 
6239     int opcode = this->ideal_Opcode();
6240     int vlen_enc = Assembler::AVX_128bit;
6241     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6242     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6243   %}
6244   ins_pipe( pipe_slow );
6245 %}
6246 
6247 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6248   predicate(Matcher::vector_length(n) == 16 &&
6249             !VectorNode::is_vshift_cnt(n->in(2)) &&
6250             !VM_Version::supports_avx512bw());
6251   match(Set dst ( LShiftVB src shift));
6252   match(Set dst ( RShiftVB src shift));
6253   match(Set dst (URShiftVB src shift));
6254   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6255   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6256   ins_encode %{
6257     assert(UseAVX >= 2, "required");
6258 
6259     int opcode = this->ideal_Opcode();
6260     int vlen_enc = Assembler::AVX_128bit;
6261     // Shift lower half and get word result in dst
6262     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6263 
6264     // Shift upper half and get word result in vtmp1
6265     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6266     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6267     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6268 
6269     // Merge and down convert the two word results to byte in dst
6270     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6271   %}
6272   ins_pipe( pipe_slow );
6273 %}
6274 
6275 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
6276   predicate(Matcher::vector_length(n) == 32 &&
6277             !VectorNode::is_vshift_cnt(n->in(2)) &&
6278             !VM_Version::supports_avx512bw());
6279   match(Set dst ( LShiftVB src shift));
6280   match(Set dst ( RShiftVB src shift));
6281   match(Set dst (URShiftVB src shift));
6282   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
  format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
6284   ins_encode %{
6285     assert(UseAVX >= 2, "required");
6286 
6287     int opcode = this->ideal_Opcode();
6288     int vlen_enc = Assembler::AVX_128bit;
6289     // Process lower 128 bits and get result in dst
6290     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6291     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6292     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6293     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6294     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6295 
6296     // Process higher 128 bits and get result in vtmp3
6297     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6298     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6299     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
6300     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
6301     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
6302     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6303     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
6304 
6305     // Merge the two results in dst
6306     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6307   %}
6308   ins_pipe( pipe_slow );
6309 %}
6310 
6311 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6312   predicate(Matcher::vector_length(n) <= 32 &&
6313             !VectorNode::is_vshift_cnt(n->in(2)) &&
6314             VM_Version::supports_avx512bw());
6315   match(Set dst ( LShiftVB src shift));
6316   match(Set dst ( RShiftVB src shift));
6317   match(Set dst (URShiftVB src shift));
6318   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6319   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6320   ins_encode %{
6321     assert(UseAVX > 2, "required");
6322 
6323     int opcode = this->ideal_Opcode();
6324     int vlen_enc = vector_length_encoding(this);
6325     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6326   %}
6327   ins_pipe( pipe_slow );
6328 %}
6329 
6330 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6331   predicate(Matcher::vector_length(n) == 64 &&
6332             !VectorNode::is_vshift_cnt(n->in(2)) &&
6333             VM_Version::supports_avx512bw());
6334   match(Set dst ( LShiftVB src shift));
6335   match(Set dst ( RShiftVB src shift));
6336   match(Set dst (URShiftVB src shift));
6337   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6338   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6339   ins_encode %{
6340     assert(UseAVX > 2, "required");
6341 
6342     int opcode = this->ideal_Opcode();
6343     int vlen_enc = Assembler::AVX_256bit;
6344     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6345     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6346     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6347     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6348     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6349   %}
6350   ins_pipe( pipe_slow );
6351 %}
6352 
6353 // Short variable shift
6354 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6355   predicate(Matcher::vector_length(n) <= 8 &&
6356             !VectorNode::is_vshift_cnt(n->in(2)) &&
6357             !VM_Version::supports_avx512bw());
6358   match(Set dst ( LShiftVS src shift));
6359   match(Set dst ( RShiftVS src shift));
6360   match(Set dst (URShiftVS src shift));
6361   effect(TEMP dst, TEMP vtmp, TEMP scratch);
  format %{ "vector_varshift_short $dst,$src,$shift\t! using $vtmp, $scratch as TEMP" %}
6363   ins_encode %{
6364     assert(UseAVX >= 2, "required");
6365 
6366     int opcode = this->ideal_Opcode();
6367     bool sign = (opcode != Op_URShiftVS);
6368     int vlen_enc = Assembler::AVX_256bit;
    __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
    __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6371     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
6372     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6373     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
6374     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6375   %}
6376   ins_pipe( pipe_slow );
6377 %}
6378 
6379 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6380   predicate(Matcher::vector_length(n) == 16 &&
6381             !VectorNode::is_vshift_cnt(n->in(2)) &&
6382             !VM_Version::supports_avx512bw());
6383   match(Set dst ( LShiftVS src shift));
6384   match(Set dst ( RShiftVS src shift));
6385   match(Set dst (URShiftVS src shift));
6386   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
  format %{ "vector_varshift_short $dst,$src,$shift\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6388   ins_encode %{
6389     assert(UseAVX >= 2, "required");
6390 
6391     int opcode = this->ideal_Opcode();
6392     bool sign = (opcode != Op_URShiftVS);
6393     int vlen_enc = Assembler::AVX_256bit;
    // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
6395     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6396     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6397     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6398     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6399 
    // Shift upper half, with result in dst using vtmp1 as TEMP
6401     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
6402     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
6403     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6404     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6405     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6406     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6407 
6408     // Merge lower and upper half result into dst
6409     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6410     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6411   %}
6412   ins_pipe( pipe_slow );
6413 %}
6414 
6415 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
6416   predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6417             VM_Version::supports_avx512bw());
6418   match(Set dst ( LShiftVS src shift));
6419   match(Set dst ( RShiftVS src shift));
6420   match(Set dst (URShiftVS src shift));
6421   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
6422   ins_encode %{
6423     assert(UseAVX > 2, "required");
6424 
6425     int opcode = this->ideal_Opcode();
6426     int vlen_enc = vector_length_encoding(this);
6427     if (!VM_Version::supports_avx512vl()) {
6428       vlen_enc = Assembler::AVX_512bit;
6429     }
6430     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6431   %}
6432   ins_pipe( pipe_slow );
6433 %}
6434 
// Integer variable shift
6436 instruct vshiftI_var(vec dst, vec src, vec shift) %{
6437   predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6438   match(Set dst ( LShiftVI src shift));
6439   match(Set dst ( RShiftVI src shift));
6440   match(Set dst (URShiftVI src shift));
6441   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
6442   ins_encode %{
6443     assert(UseAVX >= 2, "required");
6444 
6445     int opcode = this->ideal_Opcode();
6446     int vlen_enc = vector_length_encoding(this);
6447     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6448   %}
6449   ins_pipe( pipe_slow );
6450 %}
6451 
// Long variable shift
6453 instruct vshiftL_var(vec dst, vec src, vec shift) %{
6454   predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6455   match(Set dst ( LShiftVL src shift));
6456   match(Set dst (URShiftVL src shift));
6457   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6458   ins_encode %{
6459     assert(UseAVX >= 2, "required");
6460 
6461     int opcode = this->ideal_Opcode();
6462     int vlen_enc = vector_length_encoding(this);
6463     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6464   %}
6465   ins_pipe( pipe_slow );
6466 %}
6467 
// Long variable arithmetic right shift
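// AVX2 provides variable logical shifts for longs (vpsllvq/vpsrlvq) but no vpsravq, so the
// arithmetic right shift is emulated by the varshiftq macro using an extra vector temporary;
// AVX-512 targets can use vpsravq directly (see vshiftL_arith_var_evex below).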
6469 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
6470   predicate(Matcher::vector_length(n) <= 4 &&
6471             !VectorNode::is_vshift_cnt(n->in(2)) &&
6472             UseAVX == 2);
6473   match(Set dst (RShiftVL src shift));
6474   effect(TEMP dst, TEMP vtmp);
6475   format %{ "vector_varshift_long  $dst,$src,$shift\n\t! using $vtmp as TEMP" %}
6476   ins_encode %{
6477     int opcode = this->ideal_Opcode();
6478     int vlen_enc = vector_length_encoding(this);
6479     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
6480                  $vtmp$$XMMRegister);
6481   %}
6482   ins_pipe( pipe_slow );
6483 %}
6484 
6485 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
6486   predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6487             UseAVX > 2);
6488   match(Set dst (RShiftVL src shift));
6489   format %{ "vector_varfshift_long $dst,$src,$shift\t!" %}
6490   ins_encode %{
6491     int opcode = this->ideal_Opcode();
6492     int vlen_enc = vector_length_encoding(this);
6493     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6494   %}
6495   ins_pipe( pipe_slow );
6496 %}
6497 
6498 // --------------------------------- AND --------------------------------------
6499 
6500 instruct vand(vec dst, vec src) %{
6501   predicate(UseAVX == 0);
6502   match(Set dst (AndV dst src));
6503   format %{ "pand    $dst,$src\t! and vectors" %}
6504   ins_encode %{
6505     __ pand($dst$$XMMRegister, $src$$XMMRegister);
6506   %}
6507   ins_pipe( pipe_slow );
6508 %}
6509 
6510 instruct vand_reg(vec dst, vec src1, vec src2) %{
6511   predicate(UseAVX > 0);
6512   match(Set dst (AndV src1 src2));
6513   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
6514   ins_encode %{
6515     int vlen_enc = vector_length_encoding(this);
6516     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6517   %}
6518   ins_pipe( pipe_slow );
6519 %}
6520 
6521 instruct vand_mem(vec dst, vec src, memory mem) %{
6522   predicate((UseAVX > 0) &&
6523             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6524   match(Set dst (AndV src (LoadVector mem)));
6525   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
6526   ins_encode %{
6527     int vlen_enc = vector_length_encoding(this);
6528     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6529   %}
6530   ins_pipe( pipe_slow );
6531 %}
6532 
6533 // --------------------------------- OR ---------------------------------------
6534 
6535 instruct vor(vec dst, vec src) %{
6536   predicate(UseAVX == 0);
6537   match(Set dst (OrV dst src));
6538   format %{ "por     $dst,$src\t! or vectors" %}
6539   ins_encode %{
6540     __ por($dst$$XMMRegister, $src$$XMMRegister);
6541   %}
6542   ins_pipe( pipe_slow );
6543 %}
6544 
6545 instruct vor_reg(vec dst, vec src1, vec src2) %{
6546   predicate(UseAVX > 0);
6547   match(Set dst (OrV src1 src2));
6548   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
6549   ins_encode %{
6550     int vlen_enc = vector_length_encoding(this);
6551     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6552   %}
6553   ins_pipe( pipe_slow );
6554 %}
6555 
6556 instruct vor_mem(vec dst, vec src, memory mem) %{
6557   predicate((UseAVX > 0) &&
6558             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6559   match(Set dst (OrV src (LoadVector mem)));
6560   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
6561   ins_encode %{
6562     int vlen_enc = vector_length_encoding(this);
6563     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6564   %}
6565   ins_pipe( pipe_slow );
6566 %}
6567 
6568 // --------------------------------- XOR --------------------------------------
6569 
6570 instruct vxor(vec dst, vec src) %{
6571   predicate(UseAVX == 0);
6572   match(Set dst (XorV dst src));
6573   format %{ "pxor    $dst,$src\t! xor vectors" %}
6574   ins_encode %{
6575     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
6576   %}
6577   ins_pipe( pipe_slow );
6578 %}
6579 
6580 instruct vxor_reg(vec dst, vec src1, vec src2) %{
6581   predicate(UseAVX > 0);
6582   match(Set dst (XorV src1 src2));
6583   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
6584   ins_encode %{
6585     int vlen_enc = vector_length_encoding(this);
6586     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6587   %}
6588   ins_pipe( pipe_slow );
6589 %}
6590 
6591 instruct vxor_mem(vec dst, vec src, memory mem) %{
6592   predicate((UseAVX > 0) &&
6593             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6594   match(Set dst (XorV src (LoadVector mem)));
6595   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
6596   ins_encode %{
6597     int vlen_enc = vector_length_encoding(this);
6598     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6599   %}
6600   ins_pipe( pipe_slow );
6601 %}
6602 
6603 // --------------------------------- VectorCast --------------------------------------
6604 
6605 instruct vcastBtoX(vec dst, vec src) %{
6606   match(Set dst (VectorCastB2X src));
6607   format %{ "vector_cast_b2x $dst,$src\t!" %}
6608   ins_encode %{
6609     assert(UseAVX > 0, "required");
6610 
6611     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6612     int vlen_enc = vector_length_encoding(this);
6613     switch (to_elem_bt) {
6614       case T_SHORT:
6615         __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6616         break;
6617       case T_INT:
6618         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6619         break;
6620       case T_FLOAT:
6621         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6622         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6623         break;
6624       case T_LONG:
6625         __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6626         break;
6627       case T_DOUBLE:
6628         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6629         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6630         break;
6631 
6632       default: assert(false, "%s", type2name(to_elem_bt));
6633     }
6634   %}
6635   ins_pipe( pipe_slow );
6636 %}
6637 
6638 instruct castStoX(vec dst, vec src, rRegP scratch) %{
6639   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6640             Matcher::vector_length(n->in(1)) <= 8 && // src
6641             Matcher::vector_element_basic_type(n) == T_BYTE);
6642   effect(TEMP scratch);
6643   match(Set dst (VectorCastS2X src));
6644   format %{ "vector_cast_s2x $dst,$src\t! using $scratch as TEMP" %}
6645   ins_encode %{
6646     assert(UseAVX > 0, "required");
6647 
6648     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, $scratch$$Register);
6649     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6650   %}
6651   ins_pipe( pipe_slow );
6652 %}
6653 
6654 instruct vcastStoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6655   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6656             Matcher::vector_length(n->in(1)) == 16 && // src
6657             Matcher::vector_element_basic_type(n) == T_BYTE);
6658   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6659   match(Set dst (VectorCastS2X src));
6660   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp, $scratch as TEMP" %}
6661   ins_encode %{
6662     assert(UseAVX > 0, "required");
6663 
6664     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
6665     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6666     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
6667     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6668   %}
6669   ins_pipe( pipe_slow );
6670 %}
6671 
6672 instruct vcastStoX_evex(vec dst, vec src) %{
6673   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
6674             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
6675   match(Set dst (VectorCastS2X src));
6676   format %{ "vector_cast_s2x $dst,$src\t!" %}
6677   ins_encode %{
6678     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6679     int src_vlen_enc = vector_length_encoding(this, $src);
6680     int vlen_enc = vector_length_encoding(this);
6681     switch (to_elem_bt) {
6682       case T_BYTE:
6683         if (!VM_Version::supports_avx512vl()) {
6684           vlen_enc = Assembler::AVX_512bit;
6685         }
6686         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6687         break;
6688       case T_INT:
6689         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6690         break;
6691       case T_FLOAT:
6692         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6693         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6694         break;
6695       case T_LONG:
6696         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6697         break;
6698       case T_DOUBLE:
6699         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6700         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6701         break;
6702       default:
6703         ShouldNotReachHere();
6704     }
6705   %}
6706   ins_pipe( pipe_slow );
6707 %}
6708 
6709 instruct castItoX(vec dst, vec src, rRegP scratch) %{
6710   predicate(UseAVX <= 2 &&
6711             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
6712             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
6713   match(Set dst (VectorCastI2X src));
6714   format %{ "vector_cast_i2x $dst,$src\t! using $scratch as TEMP" %}
6715   effect(TEMP scratch);
6716   ins_encode %{
6717     assert(UseAVX > 0, "required");
6718 
6719     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6720     int vlen_enc = vector_length_encoding(this, $src);
6721 
6722     if (to_elem_bt == T_BYTE) {
6723       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
6724       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6725       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6726     } else {
6727       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
6728       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6729       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6730     }
6731   %}
6732   ins_pipe( pipe_slow );
6733 %}
6734 
6735 instruct vcastItoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6736   predicate(UseAVX <= 2 &&
6737             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
6738             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
6739   match(Set dst (VectorCastI2X src));
6740   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp and $scratch as TEMP" %}
6741   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6742   ins_encode %{
6743     assert(UseAVX > 0, "required");
6744 
6745     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6746     int vlen_enc = vector_length_encoding(this, $src);
6747 
6748     if (to_elem_bt == T_BYTE) {
6749       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
6750       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
6751       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6752       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6753     } else {
6754       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
6755       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6756       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
6757       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6758     }
6759   %}
6760   ins_pipe( pipe_slow );
6761 %}
6762 
6763 instruct vcastItoX_evex(vec dst, vec src) %{
6764   predicate(UseAVX > 2 ||
6765             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
6766   match(Set dst (VectorCastI2X src));
6767   format %{ "vector_cast_i2x $dst,$src\t!" %}
6768   ins_encode %{
6769     assert(UseAVX > 0, "required");
6770 
6771     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
6772     int src_vlen_enc = vector_length_encoding(this, $src);
6773     int dst_vlen_enc = vector_length_encoding(this);
6774     switch (dst_elem_bt) {
6775       case T_BYTE:
6776         if (!VM_Version::supports_avx512vl()) {
6777           src_vlen_enc = Assembler::AVX_512bit;
6778         }
6779         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6780         break;
6781       case T_SHORT:
6782         if (!VM_Version::supports_avx512vl()) {
6783           src_vlen_enc = Assembler::AVX_512bit;
6784         }
6785         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6786         break;
6787       case T_FLOAT:
6788         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6789         break;
6790       case T_LONG:
6791         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6792         break;
6793       case T_DOUBLE:
6794         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6795         break;
6796       default:
6797         ShouldNotReachHere();
6798     }
6799   %}
6800   ins_pipe( pipe_slow );
6801 %}
6802 
6803 instruct vcastLtoBS(vec dst, vec src, rRegP scratch) %{
6804   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
6805             UseAVX <= 2);
6806   match(Set dst (VectorCastL2X src));
6807   effect(TEMP scratch);
6808   format %{ "vector_cast_l2x  $dst,$src\t! using $scratch as TEMP" %}
6809   ins_encode %{
6810     assert(UseAVX > 0, "required");
6811 
6812     int vlen = Matcher::vector_length_in_bytes(this, $src);
6813     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
6814     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
6815                                                       : ExternalAddress(vector_int_to_short_mask());
6816     if (vlen <= 16) {
6817       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
6818       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
6819       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6820     } else {
6821       assert(vlen <= 32, "required");
6822       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
6823       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
6824       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
6825       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6826     }
6827     if (to_elem_bt == T_BYTE) {
6828       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6829     }
6830   %}
6831   ins_pipe( pipe_slow );
6832 %}
6833 
6834 instruct vcastLtoX_evex(vec dst, vec src) %{
6835   predicate(UseAVX > 2 ||
6836             (Matcher::vector_element_basic_type(n) == T_INT ||
6837              Matcher::vector_element_basic_type(n) == T_FLOAT ||
6838              Matcher::vector_element_basic_type(n) == T_DOUBLE));
6839   match(Set dst (VectorCastL2X src));
6840   format %{ "vector_cast_l2x  $dst,$src\t!" %}
6841   ins_encode %{
6842     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6843     int vlen = Matcher::vector_length_in_bytes(this, $src);
6844     int vlen_enc = vector_length_encoding(this, $src);
6845     switch (to_elem_bt) {
6846       case T_BYTE:
6847         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
6848           vlen_enc = Assembler::AVX_512bit;
6849         }
6850         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6851         break;
6852       case T_SHORT:
6853         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
6854           vlen_enc = Assembler::AVX_512bit;
6855         }
6856         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6857         break;
6858       case T_INT:
6859         if (vlen == 8) {
6860           if ($dst$$XMMRegister != $src$$XMMRegister) {
6861             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6862           }
6863         } else if (vlen == 16) {
6864           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
6865         } else if (vlen == 32) {
6866           if (UseAVX > 2) {
6867             if (!VM_Version::supports_avx512vl()) {
6868               vlen_enc = Assembler::AVX_512bit;
6869             }
6870             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6871           } else {
6872             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
6873             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
6874           }
6875         } else { // vlen == 64
6876           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6877         }
6878         break;
6879       case T_FLOAT:
6880         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
6881         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6882         break;
6883       case T_DOUBLE:
6884         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
6885         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6886         break;
6887 
6888       default: assert(false, "%s", type2name(to_elem_bt));
6889     }
6890   %}
6891   ins_pipe( pipe_slow );
6892 %}
6893 
6894 instruct vcastFtoD_reg(vec dst, vec src) %{
6895   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
6896   match(Set dst (VectorCastF2X src));
6897   format %{ "vector_cast_f2x  $dst,$src\t!" %}
6898   ins_encode %{
6899     int vlen_enc = vector_length_encoding(this);
6900     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6901   %}
6902   ins_pipe( pipe_slow );
6903 %}
6904 
6905 instruct vcastDtoF_reg(vec dst, vec src) %{
6906   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
6907   match(Set dst (VectorCastD2X src));
6908   format %{ "vector_cast_d2x  $dst,$src\t!" %}
6909   ins_encode %{
6910     int vlen_enc = vector_length_encoding(this, $src);
6911     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6912   %}
6913   ins_pipe( pipe_slow );
6914 %}
6915 
6916 // --------------------------------- VectorMaskCmp --------------------------------------
6917 
6918 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
6919   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
6920             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
6921             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6922   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6923   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
6924   ins_encode %{
6925     int vlen_enc = vector_length_encoding(this, $src1);
6926     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6927     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
6928       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6929     } else {
6930       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6931     }
6932   %}
6933   ins_pipe( pipe_slow );
6934 %}
6935 
6936 instruct evcmpFD(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
6937   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
6938             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6939   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6940   effect(TEMP scratch, TEMP ktmp);
6941   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6942   ins_encode %{
6943     int vlen_enc = Assembler::AVX_512bit;
6944     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6945     KRegister mask = k0; // The comparison itself is not being masked.
6946     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
6947       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6948       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
6949     } else {
6950       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6951       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
6952     }
6953   %}
6954   ins_pipe( pipe_slow );
6955 %}
6956 
6957 instruct vcmp(legVec dst, legVec src1, legVec src2, immI8 cond, rRegP scratch) %{
6958   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vl()) &&
6959             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
6960             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
6961             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
6962             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
6963   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6964   effect(TEMP scratch);
6965   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6966   ins_encode %{
6967     int vlen_enc = vector_length_encoding(this, $src1);
6968     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6969     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
6970     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, ww, vlen_enc, $scratch$$Register);
6971   %}
6972   ins_pipe( pipe_slow );
6973 %}
6974 
6975 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
6976   predicate((UseAVX == 2 || !VM_Version::supports_avx512vl()) &&
6977             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
6978             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
6979             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 16 && // src1
6980             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
6981   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6982   effect(TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6983   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6984   ins_encode %{
6985     int vlen = Matcher::vector_length_in_bytes(this, $src1);
6986     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6987     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
6988     __ vpcmpu(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
6989               $vtmp2$$XMMRegister, $scratch$$Register);
6990   %}
6991   ins_pipe( pipe_slow );
6992 %}
6993 
6994 instruct vcmpu32(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, legVec vtmp3, rRegP scratch) %{
6995   predicate((UseAVX == 2 || !VM_Version::supports_avx512vl()) &&
6996             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
6997             Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 32 && // src1
6998             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
6999   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7000   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP scratch);
7001   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7002   ins_encode %{
7003     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7004     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7005     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
7006     __ vpcmpu32(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
7007                 $vtmp2$$XMMRegister, $vtmp3$$XMMRegister, $scratch$$Register);
7008   %}
7009   ins_pipe( pipe_slow );
7010 %}
7011 
7012 instruct evcmp(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
7013   predicate(UseAVX > 2 &&
7014             (VM_Version::supports_avx512vl() ||
7015              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
7016              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7017   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7018   effect(TEMP scratch, TEMP ktmp);
7019   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7020   ins_encode %{
7021     assert(UseAVX > 2, "required");
7022 
7023     int vlen_enc = vector_length_encoding(this, $src1);
7024     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7025     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
7026     KRegister mask = k0; // The comparison itself is not being masked.
7027     bool merge = false;
7028     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7029 
7030     switch (src1_elem_bt) {
7031       case T_BYTE: {
7032         __ evpcmpb($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7033         __ evmovdqub($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7034         break;
7035       }
7036       case T_SHORT: {
7037         __ evpcmpw($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7038         __ evmovdquw($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7039         break;
7040       }
7041       case T_INT: {
7042         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7043         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7044         break;
7045       }
7046       case T_LONG: {
7047         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7048         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7049         break;
7050       }
7051       default: assert(false, "%s", type2name(src1_elem_bt));
7052     }
7053   %}
7054   ins_pipe( pipe_slow );
7055 %}
7056 
7057 // Extract
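// Elements residing in the low 128 bits are extracted directly; for 256/512-bit sources the
// 128-bit lane containing the element is first copied into a temporary (get_lane) and the
// element is then read from that lane.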
7058 
7059 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
7060   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
7061   match(Set dst (ExtractI src idx));
7062   match(Set dst (ExtractS src idx));
7063 #ifdef _LP64
7064   match(Set dst (ExtractB src idx));
7065 #endif
7066   format %{ "extractI $dst,$src,$idx\t!" %}
7067   ins_encode %{
7068     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7069 
7070     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7071     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7072   %}
7073   ins_pipe( pipe_slow );
7074 %}
7075 
7076 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
7077   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
7078             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
7079   match(Set dst (ExtractI src idx));
7080   match(Set dst (ExtractS src idx));
7081 #ifdef _LP64
7082   match(Set dst (ExtractB src idx));
7083 #endif
7084   effect(TEMP vtmp);
7085   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
7086   ins_encode %{
7087     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7088 
7089     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7090     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7091     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
7092   %}
7093   ins_pipe( pipe_slow );
7094 %}
7095 
7096 #ifdef _LP64
7097 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
7098   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
7099   match(Set dst (ExtractL src idx));
7100   format %{ "extractL $dst,$src,$idx\t!" %}
7101   ins_encode %{
7102     assert(UseSSE >= 4, "required");
7103     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7104 
7105     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7106   %}
7107   ins_pipe( pipe_slow );
7108 %}
7109 
7110 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
7111   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7112             Matcher::vector_length(n->in(1)) == 8);  // src
7113   match(Set dst (ExtractL src idx));
7114   effect(TEMP vtmp);
7115   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
7116   ins_encode %{
7117     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7118 
7119     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7120     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
7121   %}
7122   ins_pipe( pipe_slow );
7123 %}
7124 #endif
7125 
7126 instruct extractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7127   predicate(Matcher::vector_length(n->in(1)) <= 4);
7128   match(Set dst (ExtractF src idx));
7129   effect(TEMP dst, TEMP tmp, TEMP vtmp);
7130   format %{ "extractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7131   ins_encode %{
7132     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7133 
7134     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $tmp$$Register, $vtmp$$XMMRegister);
7135   %}
7136   ins_pipe( pipe_slow );
7137 %}
7138 
7139 instruct vextractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7140   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
7141             Matcher::vector_length(n->in(1)/*src*/) == 16);
7142   match(Set dst (ExtractF src idx));
7143   effect(TEMP tmp, TEMP vtmp);
7144   format %{ "vextractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7145   ins_encode %{
7146     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7147 
7148     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7149     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant, $tmp$$Register);
7150   %}
7151   ins_pipe( pipe_slow );
7152 %}
7153 
7154 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
7155   predicate(Matcher::vector_length(n->in(1)) == 2); // src
7156   match(Set dst (ExtractD src idx));
7157   format %{ "extractD $dst,$src,$idx\t!" %}
7158   ins_encode %{
7159     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7160 
7161     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7162   %}
7163   ins_pipe( pipe_slow );
7164 %}
7165 
7166 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
7167   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7168             Matcher::vector_length(n->in(1)) == 8);  // src
7169   match(Set dst (ExtractD src idx));
7170   effect(TEMP vtmp);
7171   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
7172   ins_encode %{
7173     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7174 
7175     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7176     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
7177   %}
7178   ins_pipe( pipe_slow );
7179 %}
7180 
7181 // --------------------------------- Vector Blend --------------------------------------
7182 
7183 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
7184   predicate(UseAVX == 0);
7185   match(Set dst (VectorBlend (Binary dst src) mask));
7186   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
7187   effect(TEMP tmp);
7188   ins_encode %{
7189     assert(UseSSE >= 4, "required");
7190 
7191     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
7192       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
7193     }
7194     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
7195   %}
7196   ins_pipe( pipe_slow );
7197 %}
7198 
7199 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
7200   predicate(UseAVX > 0 &&
7201             Matcher::vector_length_in_bytes(n) <= 32 &&
7202             is_integral_type(Matcher::vector_element_basic_type(n)));
7203   match(Set dst (VectorBlend (Binary src1 src2) mask));
7204   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7205   ins_encode %{
7206     int vlen_enc = vector_length_encoding(this);
7207     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7208   %}
7209   ins_pipe( pipe_slow );
7210 %}
7211 
7212 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
7213   predicate(UseAVX > 0 &&
7214             Matcher::vector_length_in_bytes(n) <= 32 &&
7215             !is_integral_type(Matcher::vector_element_basic_type(n)));
7216   match(Set dst (VectorBlend (Binary src1 src2) mask));
7217   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7218   ins_encode %{
7219     int vlen_enc = vector_length_encoding(this);
7220     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7221   %}
7222   ins_pipe( pipe_slow );
7223 %}
7224 
7225 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch, kReg ktmp) %{
7226   predicate(Matcher::vector_length_in_bytes(n) == 64);
7227   match(Set dst (VectorBlend (Binary src1 src2) mask));
7228   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch and k2 as TEMP" %}
7229   effect(TEMP scratch, TEMP ktmp);
7230   ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
7233     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register);
7234     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7235   %}
7236   ins_pipe( pipe_slow );
7237 %}
7238 
7239 // --------------------------------- ABS --------------------------------------
7240 // a = |a|
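// pabsb/pabsw/pabsd (SSSE3) handle 128-bit vectors; wider vectors use the VEX/EVEX encoded
// forms.  64-bit element abs exists only in AVX-512 (evpabsq), hence the UseAVX > 2 assert.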
7241 instruct vabsB_reg(vec dst, vec src) %{
7242   match(Set dst (AbsVB  src));
7243   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
7244   ins_encode %{
7245     uint vlen = Matcher::vector_length(this);
7246     if (vlen <= 16) {
7247       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7248     } else {
7249       int vlen_enc = vector_length_encoding(this);
7250       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7251     }
7252   %}
7253   ins_pipe( pipe_slow );
7254 %}
7255 
7256 instruct vabsS_reg(vec dst, vec src) %{
7257   match(Set dst (AbsVS  src));
7258   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
7259   ins_encode %{
7260     uint vlen = Matcher::vector_length(this);
7261     if (vlen <= 8) {
7262       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7263     } else {
7264       int vlen_enc = vector_length_encoding(this);
7265       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7266     }
7267   %}
7268   ins_pipe( pipe_slow );
7269 %}
7270 
7271 instruct vabsI_reg(vec dst, vec src) %{
7272   match(Set dst (AbsVI  src));
7273   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
7274   ins_encode %{
7275     uint vlen = Matcher::vector_length(this);
7276     if (vlen <= 4) {
7277       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7278     } else {
7279       int vlen_enc = vector_length_encoding(this);
7280       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7281     }
7282   %}
7283   ins_pipe( pipe_slow );
7284 %}
7285 
7286 instruct vabsL_reg(vec dst, vec src) %{
7287   match(Set dst (AbsVL  src));
7288   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
7289   ins_encode %{
7290     assert(UseAVX > 2, "required");
7291     int vlen_enc = vector_length_encoding(this);
7292     if (!VM_Version::supports_avx512vl()) {
7293       vlen_enc = Assembler::AVX_512bit;
7294     }
7295     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7296   %}
7297   ins_pipe( pipe_slow );
7298 %}
7299 
7300 // --------------------------------- ABSNEG --------------------------------------
7301 
7302 instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
7303   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
7304   match(Set dst (AbsVF src));
7305   match(Set dst (NegVF src));
7306   effect(TEMP scratch);
7307   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
7308   ins_cost(150);
7309   ins_encode %{
7310     int opcode = this->ideal_Opcode();
7311     int vlen = Matcher::vector_length(this);
7312     if (vlen == 2) {
7313       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7314     } else {
7315       assert(vlen == 8 || vlen == 16, "required");
7316       int vlen_enc = vector_length_encoding(this);
7317       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7318     }
7319   %}
7320   ins_pipe( pipe_slow );
7321 %}
7322 
7323 instruct vabsneg4F(vec dst, rRegI scratch) %{
7324   predicate(Matcher::vector_length(n) == 4);
7325   match(Set dst (AbsVF dst));
7326   match(Set dst (NegVF dst));
7327   effect(TEMP scratch);
7328   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
7329   ins_cost(150);
7330   ins_encode %{
7331     int opcode = this->ideal_Opcode();
7332     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
7333   %}
7334   ins_pipe( pipe_slow );
7335 %}
7336 
7337 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
7338   match(Set dst (AbsVD  src));
7339   match(Set dst (NegVD  src));
7340   effect(TEMP scratch);
7341   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
7342   ins_encode %{
7343     int opcode = this->ideal_Opcode();
7344     uint vlen = Matcher::vector_length(this);
7345     if (vlen == 2) {
7346       assert(UseSSE >= 2, "required");
7347       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7348     } else {
7349       int vlen_enc = vector_length_encoding(this);
7350       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7351     }
7352   %}
7353   ins_pipe( pipe_slow );
7354 %}
7355 
7356 //------------------------------------- VectorTest --------------------------------------------
7357 
7358 #ifdef _LP64
7359 instruct vptest_alltrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp1, legVec vtmp2, rFlagsReg cr) %{
7360   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7361             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7362             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7363   match(Set dst (VectorTest src1 src2 ));
7364   effect(TEMP vtmp1, TEMP vtmp2, KILL cr);
7365   format %{ "vector_test $dst,$src1, $src2\t! using $vtmp1, $vtmp2 and $cr as TEMP" %}
7366   ins_encode %{
7367     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7368     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
7369     __ setb(Assembler::carrySet, $dst$$Register);
7370     __ movzbl($dst$$Register, $dst$$Register);
7371   %}
7372   ins_pipe( pipe_slow );
7373 %}
7374 
7375 instruct vptest_alltrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7376   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7377             Matcher::vector_length_in_bytes(n->in(1)) <  64 &&
7378             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7379   match(Set dst (VectorTest src1 src2 ));
7380   effect(KILL cr);
7381   format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %}
7382   ins_encode %{
7383     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7384     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7385     __ setb(Assembler::carrySet, $dst$$Register);
7386     __ movzbl($dst$$Register, $dst$$Register);
7387   %}
7388   ins_pipe( pipe_slow );
7389 %}
7390 
7391 instruct vptest_alltrue_evex(rRegI dst, legVec src1, legVec src2, kReg ktmp, rFlagsReg cr) %{
7392   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 64 &&
7393             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7394   match(Set dst (VectorTest src1 src2 ));
7395   effect(KILL cr, TEMP ktmp);
7396   format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %}
7397   ins_encode %{
7398     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7399     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister);
7400     __ setb(Assembler::carrySet, $dst$$Register);
7401     __ movzbl($dst$$Register, $dst$$Register);
7402   %}
7403   ins_pipe( pipe_slow );
7404 %}
7405 
7406 instruct vptest_anytrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp, rFlagsReg cr) %{
7407   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7408             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7409             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7410   match(Set dst (VectorTest src1 src2 ));
7411   effect(TEMP vtmp, KILL cr);
7412   format %{ "vector_test_any_true $dst,$src1,$src2\t! using $vtmp, $cr as TEMP" %}
7413   ins_encode %{
7414     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7415     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7416     __ setb(Assembler::notZero, $dst$$Register);
7417     __ movzbl($dst$$Register, $dst$$Register);
7418   %}
7419   ins_pipe( pipe_slow );
7420 %}
7421 
7422 instruct vptest_anytrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7423   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7424             Matcher::vector_length_in_bytes(n->in(1)) < 64  &&
7425             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7426   match(Set dst (VectorTest src1 src2 ));
7427   effect(KILL cr);
7428   format %{ "vector_test_any_true $dst,$src1,$src2\t! using $cr as TEMP" %}
7429   ins_encode %{
7430     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7431     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7432     __ setb(Assembler::notZero, $dst$$Register);
7433     __ movzbl($dst$$Register, $dst$$Register);
7434   %}
7435   ins_pipe( pipe_slow );
7436 %}
7437 
7438 instruct vptest_anytrue_evex(rRegI dst, legVec src1, legVec src2, kReg ktmp, rFlagsReg cr) %{
7439   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 64 &&
7440             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7441   match(Set dst (VectorTest src1 src2 ));
7442   effect(KILL cr, TEMP ktmp);
7443   format %{ "vector_test_any_true $dst,$src1,$src2\t! using $cr as TEMP" %}
7444   ins_encode %{
7445     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7446     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister);
7447     __ setb(Assembler::notZero, $dst$$Register);
7448     __ movzbl($dst$$Register, $dst$$Register);
7449   %}
7450   ins_pipe( pipe_slow );
7451 %}
7452 
7453 instruct cmpvptest_anytrue_lt16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, legVec vtmp) %{
7454   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 4 &&
7455             Matcher::vector_length_in_bytes(n->in(1)->in(1)) < 16 &&
7456             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7457   match(Set cr (CmpI (VectorTest src1 src2) zero));
7458   effect(TEMP vtmp);
7459   format %{ "cmp_vector_test_any_true $src1,$src2\t! using $vtmp as TEMP" %}
7460   ins_encode %{
7461     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7462     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7463   %}
7464   ins_pipe( pipe_slow );
7465 %}
7466 
7467 instruct cmpvptest_anytrue(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero) %{
7468   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 16 &&
7469             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <  64 &&
7470             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7471   match(Set cr (CmpI (VectorTest src1 src2) zero));
7472   format %{ "cmp_vector_test_any_true $src1,$src2\t!" %}
7473   ins_encode %{
7474     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7475     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7476   %}
7477   ins_pipe( pipe_slow );
7478 %}
7479 
7480 instruct cmpvptest_anytrue_evex(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, kReg ktmp) %{
7481   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 &&
7482             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7483   match(Set cr (CmpI (VectorTest src1 src2) zero));
7484   effect(TEMP ktmp);
7485   format %{ "cmp_vector_test_any_true $src1,$src2\t!" %}
7486   ins_encode %{
7487     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7488     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister);
7489   %}
7490   ins_pipe( pipe_slow );
7491 %}
7492 #endif
7493 
7494 //------------------------------------- LoadMask --------------------------------------------
7495 
7496 instruct loadMask(legVec dst, legVec src) %{
7497   predicate(!VM_Version::supports_avx512vlbw());
7498   match(Set dst (VectorLoadMask src));
7499   effect(TEMP dst);
7500   format %{ "vector_loadmask_byte $dst,$src\n\t" %}
7501   ins_encode %{
7502     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7503     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7504 
7505     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
7506   %}
7507   ins_pipe( pipe_slow );
7508 %}
7509 
7510 instruct loadMask_evex(vec dst, vec src) %{
7511   predicate(VM_Version::supports_avx512vlbw());
7512   match(Set dst (VectorLoadMask src));
7513   effect(TEMP dst);
7514   format %{ "vector_loadmask_byte $dst,$src\n\t" %}
7515   ins_encode %{
7516     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7517     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7518 
7519     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, false);
7520   %}
7521   ins_pipe( pipe_slow );
7522 %}
7523 
7524 //------------------------------------- StoreMask --------------------------------------------
7525 
7526 instruct storeMask1B(vec dst, vec src, immI_1 size) %{
7527   predicate(Matcher::vector_length(n) < 64 || VM_Version::supports_avx512vlbw());
7528   match(Set dst (VectorStoreMask src size));
7529   format %{ "vector_store_mask $dst,$src\t!" %}
7530   ins_encode %{
7531     assert(UseSSE >= 3, "required");
7532     if (Matcher::vector_length_in_bytes(this) <= 16) {
7533       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7534     } else {
7535       assert(UseAVX >= 2, "required");
7536       int src_vlen_enc = vector_length_encoding(this, $src);
7537       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7538     }
7539   %}
7540   ins_pipe( pipe_slow );
7541 %}
7542 
7543 instruct storeMask2B(vec dst, vec src, immI_2 size) %{
7544   predicate(Matcher::vector_length(n) <= 8);
7545   match(Set dst (VectorStoreMask src size));
7546   format %{ "vector_store_mask $dst,$src\n\t" %}
7547   ins_encode %{
7548     assert(UseSSE >= 3, "required");
7549     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7550     __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7551   %}
7552   ins_pipe( pipe_slow );
7553 %}
7554 
7555 instruct vstoreMask2B(vec dst, vec src, immI_2 size) %{
7556   predicate(Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
7557   match(Set dst (VectorStoreMask src size));
7558   effect(TEMP dst);
7559   format %{ "vector_store_mask $dst,$src\t!" %}
7560   ins_encode %{
7561     int vlen_enc = Assembler::AVX_128bit;
7562     __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
    __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7564     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7565   %}
7566   ins_pipe( pipe_slow );
7567 %}
7568 
7569 instruct vstoreMask2B_evex(vec dst, vec src, immI_2 size) %{
7570   predicate(VM_Version::supports_avx512bw());
7571   match(Set dst (VectorStoreMask src size));
7572   format %{ "vector_store_mask $dst,$src\t!" %}
7573   ins_encode %{
7574     int src_vlen_enc = vector_length_encoding(this, $src);
7575     int dst_vlen_enc = vector_length_encoding(this);
7576     __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7577     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7578   %}
7579   ins_pipe( pipe_slow );
7580 %}
7581 
7582 instruct storeMask4B(vec dst, vec src, immI_4 size) %{
7583   predicate(Matcher::vector_length(n) <= 4 && UseAVX <= 2);
7584   match(Set dst (VectorStoreMask src size));
7585   format %{ "vector_store_mask $dst,$src\t!" %}
7586   ins_encode %{
7587     assert(UseSSE >= 3, "required");
7588     __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7589     __ packssdw($dst$$XMMRegister, $dst$$XMMRegister);
7590     __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7591   %}
7592   ins_pipe( pipe_slow );
7593 %}
7594 
7595 instruct vstoreMask4B(vec dst, vec src, immI_4 size) %{
7596   predicate(Matcher::vector_length(n) == 8 && UseAVX <= 2);
7597   match(Set dst (VectorStoreMask src size));
7598   format %{ "vector_store_mask $dst,$src\t!" %}
7599   effect(TEMP dst);
7600   ins_encode %{
7601     int vlen_enc = Assembler::AVX_128bit;
7602     __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
7603     __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7604     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7605     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7606   %}
7607   ins_pipe( pipe_slow );
7608 %}
7609 
7610 instruct vstoreMask4B_evex(vec dst, vec src, immI_4 size) %{
7611   predicate(UseAVX > 2);
7612   match(Set dst (VectorStoreMask src size));
7613   format %{ "vector_store_mask $dst,$src\t!" %}
7614   ins_encode %{
7615     int src_vlen_enc = vector_length_encoding(this, $src);
7616     int dst_vlen_enc = vector_length_encoding(this);
7617     if (!VM_Version::supports_avx512vl()) {
7618       src_vlen_enc = Assembler::AVX_512bit;
7619     }
7620     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7621     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7622   %}
7623   ins_pipe( pipe_slow );
7624 %}
7625 
7626 instruct storeMask8B(vec dst, vec src, immI_8 size) %{
7627   predicate(Matcher::vector_length(n) == 2 && UseAVX <= 2);
7628   match(Set dst (VectorStoreMask src size));
7629   format %{ "vector_store_mask $dst,$src\t!" %}
7630   ins_encode %{
7631     assert(UseSSE >= 3, "required");
7632     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
7633     __ packssdw($dst$$XMMRegister, $dst$$XMMRegister);
7634     __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7635     __ pabsb($dst$$XMMRegister, $dst$$XMMRegister);
7636   %}
7637   ins_pipe( pipe_slow );
7638 %}
7639 
7640 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, legVec vtmp) %{
7641   predicate(Matcher::vector_length(n) == 4 && UseAVX <= 2);
7642   match(Set dst (VectorStoreMask src size));
7643   format %{ "vector_store_mask $dst,$src\t! using $vtmp as TEMP" %}
7644   effect(TEMP dst, TEMP vtmp);
7645   ins_encode %{
7646     int vlen_enc = Assembler::AVX_128bit;
7647     __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
7648     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
7649     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
7650     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7651     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7652     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7653   %}
7654   ins_pipe( pipe_slow );
7655 %}
7656 
7657 instruct vstoreMask8B_evex(vec dst, vec src, immI_8 size) %{
7658   predicate(UseAVX > 2);
7659   match(Set dst (VectorStoreMask src size));
7660   format %{ "vector_store_mask $dst,$src\t!" %}
7661   ins_encode %{
7662     int src_vlen_enc = vector_length_encoding(this, $src);
7663     int dst_vlen_enc = vector_length_encoding(this);
7664     if (!VM_Version::supports_avx512vl()) {
7665       src_vlen_enc = Assembler::AVX_512bit;
7666     }
7667     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7668     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7669   %}
7670   ins_pipe( pipe_slow );
7671 %}
7672 
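     // VectorMaskCast is a no-op here: the predicate only accepts casts where both the lane
     // count and the size in bytes of the mask are unchanged, so the existing register
     // contents can be reused as-is.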
7673 instruct vmaskcast(vec dst) %{
7674   predicate((Matcher::vector_length(n) == Matcher::vector_length(n->in(1))) &&
7675             (Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))));
7676   match(Set dst (VectorMaskCast dst));
7677   ins_cost(0);
7678   format %{ "vector_mask_cast $dst" %}
7679   ins_encode %{
7680     // empty
7681   %}
7682   ins_pipe(empty);
7683 %}
7684 
7685 //-------------------------------- Load Iota Indices ----------------------------------
7686 
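     // Materializes the constant byte sequence 0, 1, 2, ... ("iota" indices), presumably
     // read from a stub-generated constant table by load_iota_indices() using $scratch as
     // the address register; the vector length in bytes selects how much is loaded.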
7687 instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
7688   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
7689   match(Set dst (VectorLoadConst src));
7690   effect(TEMP scratch);
7691   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
7692   ins_encode %{
7693      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7694      __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen_in_bytes);
7695   %}
7696   ins_pipe( pipe_slow );
7697 %}
7698 
7699 //-------------------------------- Rearrange ----------------------------------
7700 
7701 // LoadShuffle/Rearrange for Byte
7702 
7703 instruct loadShuffleB(vec dst) %{
7704   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
7705   match(Set dst (VectorLoadShuffle dst));
7706   format %{ "vector_load_shuffle $dst, $dst" %}
7707   ins_encode %{
7708     // empty
7709   %}
7710   ins_pipe( pipe_slow );
7711 %}
7712 
7713 instruct rearrangeB(vec dst, vec shuffle) %{
7714   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
7715             Matcher::vector_length(n) < 32);
7716   match(Set dst (VectorRearrange dst shuffle));
7717   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
7718   ins_encode %{
7719     assert(UseSSE >= 4, "required");
7720     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
7721   %}
7722   ins_pipe( pipe_slow );
7723 %}
7724 
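     // For a 32-byte rearrange without AVX512_VBMI there is no single cross-lane byte
     // permute: vpshufb only shuffles within each 128-bit lane. The sequence below shuffles
     // both the original source and a lane-swapped copy, builds a blend mask by biasing the
     // shuffle indices (vector_byte_shufflemask(), contents assumed) so that entries which
     // refer to the other lane get their sign bit set, and lets vpblendvb pick per byte.
     // With AVX512_VBMI, rearrangeB_evex further below does the same job with a single vpermb.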
7725 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
7726   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
7727             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
7728   match(Set dst (VectorRearrange src shuffle));
7729   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
7730   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
7731   ins_encode %{
7732     assert(UseAVX >= 2, "required");
7733     // Swap src into vtmp1
7734     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
7735     // Shuffle the swapped src to get entries from the other 128-bit lane
7736     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
7737     // Shuffle the original src to get entries from its own 128-bit lane
7738     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
7739     // Create a blend mask by setting the high bit of entries whose shuffle index selects the other lane
7740     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
7741     // Perform the blend
7742     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
7743   %}
7744   ins_pipe( pipe_slow );
7745 %}
7746 
7747 instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
7748   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
7749             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
7750   match(Set dst (VectorRearrange src shuffle));
7751   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7752   ins_encode %{
7753     int vlen_enc = vector_length_encoding(this);
7754     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7755   %}
7756   ins_pipe( pipe_slow );
7757 %}
7758 
7759 // LoadShuffle/Rearrange for Short
7760 
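     // Without AVX512BW there is no word-granularity shuffle, so loadShuffleS rewrites a
     // short shuffle as a byte shuffle: each short index i expands to the byte pair
     // (2*i, 2*i+1). The indices are zero-extended to words, shifted left by one to form
     // 2*i, replicated into both bytes of each word, and then biased by the constant from
     // vector_short_shufflemask() (presumably a repeating 0/1 byte pattern) so that one of
     // the two copies becomes 2*i+1.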
7761 instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
7762   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7763             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
7764   match(Set dst (VectorLoadShuffle src));
7765   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7766   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
7767   ins_encode %{
7768     // Create a byte shuffle mask from the short shuffle mask;
7769     // only a byte shuffle instruction is available on these platforms
7770     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7771     if (UseAVX == 0) {
7772       assert(vlen_in_bytes <= 16, "required");
7773       // Multiply each shuffle by two to get byte index
7774       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
7775       __ psllw($vtmp$$XMMRegister, 1);
7776 
7777       // Duplicate to create 2 copies of byte index
7778       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
7779       __ psllw($dst$$XMMRegister, 8);
7780       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
7781 
7782       // Add one to get alternate byte index
7783       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
7784       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
7785     } else {
7786       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
7787       int vlen_enc = vector_length_encoding(this);
7788       // Multiply each shuffle by two to get byte index
7789       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
7790       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
7791 
7792       // Duplicate to create 2 copies of byte index
7793       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
7794       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
7795 
7796       // Add one to get alternate byte index
7797       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, $scratch$$Register);
7798     }
7799   %}
7800   ins_pipe( pipe_slow );
7801 %}
7802 
7803 instruct rearrangeS(vec dst, vec shuffle) %{
7804   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7805             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
7806   match(Set dst (VectorRearrange dst shuffle));
7807   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
7808   ins_encode %{
7809     assert(UseSSE >= 4, "required");
7810     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
7811   %}
7812   ins_pipe( pipe_slow );
7813 %}
7814 
7815 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
7816   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7817             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
7818   match(Set dst (VectorRearrange src shuffle));
7819   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
7820   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
7821   ins_encode %{
7822     assert(UseAVX >= 2, "required");
7823     // Swap src into vtmp1
7824     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
7825     // Shuffle the swapped src to get entries from the other 128-bit lane
7826     __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
7827     // Shuffle the original src to get entries from its own 128-bit lane
7828     __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
7829     // Create a blend mask by setting the high bit of entries whose shuffle index selects the other lane
7830     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
7831     // Perform the blend
7832     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
7833   %}
7834   ins_pipe( pipe_slow );
7835 %}
7836 
7837 instruct loadShuffleS_evex(vec dst, vec src) %{
7838   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7839             VM_Version::supports_avx512bw());
7840   match(Set dst (VectorLoadShuffle src));
7841   format %{ "vector_load_shuffle $dst, $src" %}
7842   ins_encode %{
7843     int vlen_enc = vector_length_encoding(this);
7844     if (!VM_Version::supports_avx512vl()) {
7845       vlen_enc = Assembler::AVX_512bit;
7846     }
7847     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7848   %}
7849   ins_pipe( pipe_slow );
7850 %}
7851 
7852 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
7853   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7854             VM_Version::supports_avx512bw());
7855   match(Set dst (VectorRearrange src shuffle));
7856   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7857   ins_encode %{
7858     int vlen_enc = vector_length_encoding(this);
7859     if (!VM_Version::supports_avx512vl()) {
7860       vlen_enc = Assembler::AVX_512bit;
7861     }
7862     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7863   %}
7864   ins_pipe( pipe_slow );
7865 %}
7866 
7867 // LoadShuffle/Rearrange for Integer and Float
7868 
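     // Pre-AVX2 int/float shuffles use the same trick: each int index i becomes the byte
     // quadruple (4*i .. 4*i+3). The indices are zero-extended to dwords, replicated per
     // word with pshuflw/pshufhw, scaled by four, duplicated into both bytes of each word,
     // and finally biased by the pattern loaded from vector_int_shufflemask() (contents
     // assumed, per the "Add 3,2,1,0" comment below).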
7869 instruct loadShuffleI(vec dst, vec src, vec vtmp, rRegP scratch) %{
7870   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
7871             Matcher::vector_length(n) == 4 && UseAVX < 2);
7872   match(Set dst (VectorLoadShuffle src));
7873   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7874   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
7875   ins_encode %{
7876     assert(UseSSE >= 4, "required");
7877 
7878     // Create a byte shuffle mask from the int shuffle mask;
7879     // only a byte shuffle instruction is available on these platforms
7880 
7881     // Duplicate and multiply each shuffle by 4
7882     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
7883     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
7884     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
7885     __ psllw($vtmp$$XMMRegister, 2);
7886 
7887     // Duplicate again to create 4 copies of byte index
7888     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
7889     __ psllw($dst$$XMMRegister, 8);
7890     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
7891 
7892     // Add 3,2,1,0 to get alternate byte index
7893     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
7894     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
7895   %}
7896   ins_pipe( pipe_slow );
7897 %}
7898 
7899 instruct rearrangeI(vec dst, vec shuffle) %{
7900  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
7901            Matcher::vector_length(n) == 4 && UseAVX < 2);
7902   match(Set dst (VectorRearrange dst shuffle));
7903   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
7904   ins_encode %{
7905     assert(UseSSE >= 4, "required");
7906     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
7907   %}
7908   ins_pipe( pipe_slow );
7909 %}
7910 
7911 instruct loadShuffleI_avx(vec dst, vec src) %{
7912   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
7913             UseAVX >= 2);
7914   match(Set dst (VectorLoadShuffle src));
7915   format %{ "vector_load_shuffle $dst, $src" %}
7916   ins_encode %{
7917     int vlen_enc = vector_length_encoding(this);
7918     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7919   %}
7920   ins_pipe( pipe_slow );
7921 %}
7922 
7923 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
7924   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
7925             UseAVX >= 2);
7926   match(Set dst (VectorRearrange src shuffle));
7927   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7928   ins_encode %{
7929     int vlen_enc = vector_length_encoding(this);
7930     if (vlen_enc == Assembler::AVX_128bit) {
7931       vlen_enc = Assembler::AVX_256bit;
7932     }
7933     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7934   %}
7935   ins_pipe( pipe_slow );
7936 %}
7937 
7938 // LoadShuffle/Rearrange for Long and Double
7939 
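     // Long/double rearranges with fewer than 8 lanes and no AVX512VL fall back to vpermd,
     // which permutes 32-bit elements. loadShuffleL therefore rewrites each long index i as
     // the dword pair (2*i, 2*i+1), analogous to the short/int cases above.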
7940 instruct loadShuffleL(vec dst, vec src, vec vtmp, rRegP scratch) %{
7941   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7942             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
7943   match(Set dst (VectorLoadShuffle src));
7944   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7945   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
7946   ins_encode %{
7947     assert(UseAVX >= 2, "required");
7948 
7949     int vlen_enc = vector_length_encoding(this);
7950     // Create a double-word shuffle mask from the long shuffle mask;
7951     // only a double-word shuffle instruction is available on these platforms
7952 
7953     // Multiply each shuffle by two to get double word index
7954     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
7955     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
7956 
7957     // Duplicate each double word shuffle
7958     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
7959     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
7960 
7961     // Add one to get alternate double word index
7962     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, $scratch$$Register);
7963   %}
7964   ins_pipe( pipe_slow );
7965 %}
7966 
7967 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
7968   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7969             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
7970   match(Set dst (VectorRearrange src shuffle));
7971   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7972   ins_encode %{
7973     assert(UseAVX >= 2, "required");
7974 
7975     int vlen_enc = vector_length_encoding(this);
7976     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7977   %}
7978   ins_pipe( pipe_slow );
7979 %}
7980 
7981 instruct loadShuffleL_evex(vec dst, vec src) %{
7982   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7983             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
7984   match(Set dst (VectorLoadShuffle src));
7985   format %{ "vector_load_shuffle $dst, $src" %}
7986   ins_encode %{
7987     assert(UseAVX > 2, "required");
7988 
7989     int vlen_enc = vector_length_encoding(this);
7990     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7991   %}
7992   ins_pipe( pipe_slow );
7993 %}
7994 
7995 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
7996   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7997             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
7998   match(Set dst (VectorRearrange src shuffle));
7999   format %{ "vector_rearrange $dst, $shuffle, $src" %}
8000   ins_encode %{
8001     assert(UseAVX > 2, "required");
8002 
8003     int vlen_enc = vector_length_encoding(this);
8004     if (vlen_enc == Assembler::AVX_128bit) {
8005       vlen_enc = Assembler::AVX_256bit;
8006     }
8007     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
8008   %}
8009   ins_pipe( pipe_slow );
8010 %}
8011 
8012 // --------------------------------- FMA --------------------------------------
8013 // a * b + c
8014 
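     // FmaVF/FmaVD compute a*b + c with a single rounding. The match rules write the result
     // back into $c, so the accumulator form of the FMA instruction is emitted; use is
     // guarded by the UseFMA flag (see the asserts below).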
8015 instruct vfmaF_reg(vec a, vec b, vec c) %{
8016   match(Set c (FmaVF  c (Binary a b)));
8017   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8018   ins_cost(150);
8019   ins_encode %{
8020     assert(UseFMA, "not enabled");
8021     int vlen_enc = vector_length_encoding(this);
8022     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8023   %}
8024   ins_pipe( pipe_slow );
8025 %}
8026 
8027 instruct vfmaF_mem(vec a, memory b, vec c) %{
8028   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8029   match(Set c (FmaVF  c (Binary a (LoadVector b))));
8030   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8031   ins_cost(150);
8032   ins_encode %{
8033     assert(UseFMA, "not enabled");
8034     int vlen_enc = vector_length_encoding(this);
8035     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8036   %}
8037   ins_pipe( pipe_slow );
8038 %}
8039 
8040 instruct vfmaD_reg(vec a, vec b, vec c) %{
8041   match(Set c (FmaVD  c (Binary a b)));
8042   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8043   ins_cost(150);
8044   ins_encode %{
8045     assert(UseFMA, "not enabled");
8046     int vlen_enc = vector_length_encoding(this);
8047     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8048   %}
8049   ins_pipe( pipe_slow );
8050 %}
8051 
8052 instruct vfmaD_mem(vec a, memory b, vec c) %{
8053   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8054   match(Set c (FmaVD  c (Binary a (LoadVector b))));
8055   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8056   ins_cost(150);
8057   ins_encode %{
8058     assert(UseFMA, "not enabled");
8059     int vlen_enc = vector_length_encoding(this);
8060     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8061   %}
8062   ins_pipe( pipe_slow );
8063 %}
8064 
8065 // --------------------------------- Vector Multiply Add --------------------------------------
8066 
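     // MulAddVS2VI maps to pmaddwd/vpmaddwd: each pair of adjacent signed 16-bit products
     // is summed into a single 32-bit lane, halving the lane count.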
8067 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
8068   predicate(UseAVX == 0);
8069   match(Set dst (MulAddVS2VI dst src1));
8070   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
8071   ins_encode %{
8072     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
8073   %}
8074   ins_pipe( pipe_slow );
8075 %}
8076 
8077 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
8078   predicate(UseAVX > 0);
8079   match(Set dst (MulAddVS2VI src1 src2));
8080   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
8081   ins_encode %{
8082     int vlen_enc = vector_length_encoding(this);
8083     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8084   %}
8085   ins_pipe( pipe_slow );
8086 %}
8087 
8088 // --------------------------------- Vector Multiply Add Add ----------------------------------
8089 
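     // With AVX512_VNNI the multiply-add and the following vector add are fused into a
     // single evpdpwssd (signed word dot product accumulating into dword lanes); the low
     // ins_cost below biases instruction selection toward this form over the
     // two-instruction sequence.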
8090 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
8091   predicate(VM_Version::supports_avx512_vnni());
8092   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
8093   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
8094   ins_encode %{
8095     assert(UseAVX > 2, "required");
8096     int vlen_enc = vector_length_encoding(this);
8097     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8098   %}
8099   ins_pipe( pipe_slow );
8100   ins_cost(10);
8101 %}
8102 
8103 // --------------------------------- PopCount --------------------------------------
8104 
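     // vpopcntd counts the set bits in each dword lane. It belongs to the AVX512_VPOPCNTDQ
     // extension; there is no predicate here, so support is presumably gated earlier in the
     // matcher, in addition to the UsePopCountInstruction assert below.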
8105 instruct vpopcountI(vec dst, vec src) %{
8106   match(Set dst (PopCountVI src));
8107   format %{ "vpopcntd  $dst,$src\t! vector popcount packedI" %}
8108   ins_encode %{
8109     assert(UsePopCountInstruction, "not enabled");
8110 
8111     int vlen_enc = vector_length_encoding(this);
8112     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8113   %}
8114   ins_pipe( pipe_slow );
8115 %}
8116 
8117 // --------------------------------- Bitwise Ternary Logic ----------------------------------
8118 
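     // vpternlogd evaluates an arbitrary three-input boolean function bit-wise: the 8-bit
     // immediate $func is the truth table indexed by the corresponding bits of the three
     // operands. For example, 0x96 is a three-way XOR and 0xE8 is the majority function.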
8119 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
8120   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
8121   effect(TEMP dst);
8122   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8123   ins_encode %{
8124     int vector_len = vector_length_encoding(this);
8125     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
8126   %}
8127   ins_pipe( pipe_slow );
8128 %}
8129 
8130 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
8131   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
8132   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
8133   effect(TEMP dst);
8134   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8135   ins_encode %{
8136     int vector_len = vector_length_encoding(this);
8137     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
8138   %}
8139   ins_pipe( pipe_slow );
8140 %}
8141 
8142 // --------------------------------- Rotation Operations ----------------------------------
8143 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
8144   match(Set dst (RotateLeftV src shift));
8145   match(Set dst (RotateRightV src shift));
8146   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
8147   ins_encode %{
8148     int opcode      = this->ideal_Opcode();
8149     int vector_len  = vector_length_encoding(this);
8150     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8151     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
8152   %}
8153   ins_pipe( pipe_slow );
8154 %}
8155 
8156 instruct vprorate(vec dst, vec src, vec shift) %{
8157   match(Set dst (RotateLeftV src shift));
8158   match(Set dst (RotateRightV src shift));
8159   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
8160   ins_encode %{
8161     int opcode      = this->ideal_Opcode();
8162     int vector_len  = vector_length_encoding(this);
8163     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8164     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8165   %}
8166   ins_pipe( pipe_slow );
8167 %}
8168 
8169 #ifdef _LP64
8170 // ---------------------------------- Masked Operations ------------------------------------
8171 
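     // 64-bit only: the rules below operate on AVX-512 opmask (k) registers. In
     // vmask_cmp_node, $dst is preset to -1, the lanes selected by $mask are compared for
     // equality into $ktmp1, and kortestql of $ktmp1 with the complemented mask sets CF
     // exactly when no selected lane mismatches; otherwise notq/tzcntq produce the index of
     // the first lane whose compare bit is clear.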
8172 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
8173   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
8174   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
8175   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
8176   ins_encode %{
8177     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
8178     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
8179 
8180     Label DONE;
8181     int vlen_enc = vector_length_encoding(this, $src1);
8182     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
8183 
8184     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
8185     __ mov64($dst$$Register, -1L);
8186     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
8187     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
8188     __ jccb(Assembler::carrySet, DONE);
8189     __ kmovql($dst$$Register, $ktmp1$$KRegister);
8190     __ notq($dst$$Register);
8191     __ tzcntq($dst$$Register, $dst$$Register);
8192     __ bind(DONE);
8193   %}
8194   ins_pipe( pipe_slow );
8195 %}
8196 
8197 
8198 instruct vmasked_load64(vec dst, memory mem, kReg mask) %{
8199   match(Set dst (LoadVectorMasked mem mask));
8200   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
8201   ins_encode %{
8202     BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
8203     int vector_len = vector_length_encoding(this);
8204     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
8205   %}
8206   ins_pipe( pipe_slow );
8207 %}
8208 
8209 instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
8210   match(Set dst (VectorMaskGen len));
8211   effect(TEMP temp);
8212   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
8213   ins_encode %{
8214     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
8215   %}
8216   ins_pipe( pipe_slow );
8217 %}
8218 
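     // For a constant length the mask is simply the low $len bits set, i.e. 2^len - 1,
     // built by shifting an all-ones value right by (64 - len) and moving it into the
     // k register.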
8219 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
8220   match(Set dst (VectorMaskGen len));
8221   format %{ "vector_mask_gen $len \t! vector mask generator" %}
8222   effect(TEMP temp);
8223   ins_encode %{
8224     __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
8225     __ kmovql($dst$$KRegister, $temp$$Register);
8226   %}
8227   ins_pipe( pipe_slow );
8228 %}
8229 
8230 instruct vmasked_store64(memory mem, vec src, kReg mask) %{
8231   match(Set mem (StoreVectorMasked mem (Binary src mask)));
8232   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
8233   ins_encode %{
8234     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
8235     BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
8236     int vector_len = vector_length_encoding(src_node);
8237     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, vector_len);
8238   %}
8239   ins_pipe( pipe_slow );
8240 %}
8241 
8242 instruct vmask_truecount_evex(rRegI dst, vec mask, rRegL tmp, kReg ktmp, vec xtmp) %{
8243   predicate(VM_Version::supports_avx512vlbw());
8244   match(Set dst (VectorMaskTrueCount mask));
8245   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp, TEMP xtmp);
8246   format %{ "vector_truecount_evex $mask \t! vector mask true count" %}
8247   ins_encode %{
8248     int opcode = this->ideal_Opcode();
8249     int vlen_enc = vector_length_encoding(this, $mask);
8250     int mask_len = Matcher::vector_length(this, $mask);
8251     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8252                              $tmp$$Register, $ktmp$$KRegister, mask_len, vlen_enc);
8253   %}
8254   ins_pipe( pipe_slow );
8255 %}
8256 
8257 instruct vmask_first_or_last_true_evex(rRegI dst, vec mask, rRegL tmp, kReg ktmp, vec xtmp, rFlagsReg cr) %{
8258   predicate(VM_Version::supports_avx512vlbw());
8259   match(Set dst (VectorMaskFirstTrue mask));
8260   match(Set dst (VectorMaskLastTrue mask));
8261   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp, TEMP xtmp, KILL cr);
8262   format %{ "vector_mask_first_or_last_true_evex $mask \t! vector first/last true location" %}
8263   ins_encode %{
8264     int opcode = this->ideal_Opcode();
8265     int vlen_enc = vector_length_encoding(this, $mask);
8266     int mask_len = Matcher::vector_length(this, $mask);
8267     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8268                              $tmp$$Register, $ktmp$$KRegister, mask_len, vlen_enc);
8269   %}
8270   ins_pipe( pipe_slow );
8271 %}
8272 
8273 instruct vmask_truecount_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1) %{
8274   predicate(!VM_Version::supports_avx512vlbw());
8275   match(Set dst (VectorMaskTrueCount mask));
8276   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, TEMP xtmp1);
8277   format %{ "vector_truecount_avx $mask \t! vector mask true count" %}
8278   ins_encode %{
8279     int opcode = this->ideal_Opcode();
8280     int vlen_enc = vector_length_encoding(this, $mask);
8281     int mask_len = Matcher::vector_length(this, $mask);
8282     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8283                              $xtmp1$$XMMRegister, $tmp$$Register, mask_len, vlen_enc);
8284   %}
8285   ins_pipe( pipe_slow );
8286 %}
8287 
8288 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1, rFlagsReg cr) %{
8289   predicate(!VM_Version::supports_avx512vlbw());
8290   match(Set dst (VectorMaskFirstTrue mask));
8291   match(Set dst (VectorMaskLastTrue mask));
8292   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, TEMP xtmp1, KILL cr);
8293   format %{ "vector_mask_first_or_last_true_avx $mask \t! vector first/last true location" %}
8294   ins_encode %{
8295     int opcode = this->ideal_Opcode();
8296     int vlen_enc = vector_length_encoding(this, $mask);
8297     int mask_len = Matcher::vector_length(this, $mask);
8298     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8299                              $xtmp1$$XMMRegister, $tmp$$Register, mask_len, vlen_enc);
8300   %}
8301   ins_pipe( pipe_slow );
8302 %}
8303 #endif // _LP64
8304 
8305 instruct castVV(vec dst)
8306 %{
8307   match(Set dst (CastVV dst));
8308 
8309   size(0);
8310   format %{ "# castVV of $dst" %}
8311   ins_encode(/* empty encoding */);
8312   ins_cost(0);
8313   ins_pipe(empty);
8314 %}
8315 
8316 instruct castVVLeg(legVec dst)
8317 %{
8318   match(Set dst (CastVV dst));
8319 
8320   size(0);
8321   format %{ "# castVV of $dst" %}
8322   ins_encode(/* empty encoding */);
8323   ins_cost(0);
8324   ins_pipe(empty);
8325 %}