//
// Copyright (c) 2011, 2021, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.
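//
// For example, the first definition below,
//   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
// declares XMM0 as save-on-call for both the allocator and the C calling
// convention, spilled as a single float slot (Op_RegF), with encoding 0,
// and backed by the concrete VM register xmm0->as_VMReg().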

// XMM registers.  512-bit registers or 16 words each, labeled (a)-p.
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 version intrinsics,
// array copy stubs and superword operations (see UseSSE42Intrinsics,
// UseXMMForArrayCopy and UseSuperword flags).
// For pre EVEX enabled architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX)
// For EVEX enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No register preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls
//              XMM0-XMM3 might hold parameters
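//
// Each 32-bit word defined below is a separate allocator slot: words (a)-(d)
// of a register form its 128-bit view, (a)-(h) its 256-bit view, and (a)-(p)
// the full 512-bit view.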

reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
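// (k0 is not defined here: in AVX-512 encodings an opmask specifier of k0
//  means "no masking", so it is not useful as an allocatable mask register.)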
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre evex float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for evex float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

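// A reg_class_dynamic selects between the two classes it is given based on
// the trailing predicate (here the EVEX set when the CPU supports EVEX
// encoding, the legacy set otherwise), so XMM16-XMM31 are only exposed to
// the register allocator on AVX-512 capable hardware.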
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for evex double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre evex 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for evex 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for all 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for all 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 964 
 965 // Class for all 128bit vector registers
 966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 973                       XMM7,  XMM7b,  XMM7c,  XMM7d
 974 #ifdef _LP64
 975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 977                       XMM10, XMM10b, XMM10c, XMM10d,
 978                       XMM11, XMM11b, XMM11c, XMM11d,
 979                       XMM12, XMM12b, XMM12c, XMM12d,
 980                       XMM13, XMM13b, XMM13c, XMM13d,
 981                       XMM14, XMM14b, XMM14c, XMM14d,
 982                       XMM15, XMM15b, XMM15c, XMM15d
 983 #endif
 984                       );
 985 
 986 // Class for all 128bit vector registers
 987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 994                       XMM7,  XMM7b,  XMM7c,  XMM7d
 995 #ifdef _LP64
 996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 998                       XMM10, XMM10b, XMM10c, XMM10d,
 999                       XMM11, XMM11b, XMM11c, XMM11d,
1000                       XMM12, XMM12b, XMM12c, XMM12d,
1001                       XMM13, XMM13b, XMM13c, XMM13d,
1002                       XMM14, XMM14b, XMM14c, XMM14d,
1003                       XMM15, XMM15b, XMM15c, XMM15d,
1004                       XMM16, XMM16b, XMM16c, XMM16d,
1005                       XMM17, XMM17b, XMM17c, XMM17d,
1006                       XMM18, XMM18b, XMM18c, XMM18d,
1007                       XMM19, XMM19b, XMM19c, XMM19d,
1008                       XMM20, XMM20b, XMM20c, XMM20d,
1009                       XMM21, XMM21b, XMM21c, XMM21d,
1010                       XMM22, XMM22b, XMM22c, XMM22d,
1011                       XMM23, XMM23b, XMM23c, XMM23d,
1012                       XMM24, XMM24b, XMM24c, XMM24d,
1013                       XMM25, XMM25b, XMM25c, XMM25d,
1014                       XMM26, XMM26b, XMM26c, XMM26d,
1015                       XMM27, XMM27b, XMM27c, XMM27d,
1016                       XMM28, XMM28b, XMM28c, XMM28d,
1017                       XMM29, XMM29b, XMM29c, XMM29d,
1018                       XMM30, XMM30b, XMM30c, XMM30d,
1019                       XMM31, XMM31b, XMM31c, XMM31d
1020 #endif
1021                       );
1022 
1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1025 
1026 // Class for all 256bit vector registers
1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1035 #ifdef _LP64
1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
1044 #endif
1045                       );
1046 
1047 // Class for all 256bit vector registers
1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1056 #ifdef _LP64
1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1081 #endif
1082                       );
1083 
1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1086 
1087 // Class for all 512bit vector registers
1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1096 #ifdef _LP64
1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1121 #endif
1122                       );
1123 
1124 // Class for restricted 512bit vector registers
1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1133 #ifdef _LP64
1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1142 #endif
1143                       );
1144 
1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1147 
1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1149 %}
1150 
1151 
1152 //----------SOURCE BLOCK-------------------------------------------------------
1153 // This is a block of C++ code which provides values, functions, and
1154 // definitions necessary in the rest of the architecture description
1155 
1156 source_hpp %{
1157 // Header information of the source block.
1158 // Method declarations/definitions which are used outside
1159 // the ad-scope can conveniently be defined here.
1160 //
1161 // To keep related declarations/definitions/uses close together,
1162 // we switch between source %{ %} and source_hpp %{ %} freely as needed.
1163 
1164 #include "runtime/vm_version.hpp"
1165 
1166 class NativeJump;
1167 
1168 class CallStubImpl {
1169 
1170   //--------------------------------------------------------------
1171   //---<  Used for optimization in Compile::shorten_branches  >---
1172   //--------------------------------------------------------------
1173 
1174  public:
1175   // Size of call trampoline stub.
1176   static uint size_call_trampoline() {
1177     return 0; // no call trampolines on this platform
1178   }
1179 
1180   // number of relocations needed by a call trampoline stub
1181   static uint reloc_call_trampoline() {
1182     return 0; // no call trampolines on this platform
1183   }
1184 };
1185 
1186 class HandlerImpl {
1187 
1188  public:
1189 
1190   static int emit_exception_handler(CodeBuffer &cbuf);
1191   static int emit_deopt_handler(CodeBuffer& cbuf);
1192 
1193   static uint size_exception_handler() {
1194     // NativeCall instruction size is the same as NativeJump.
1195     // The exception handler starts out as a jump and can be patched to
1196     // a call by deoptimization.  (4932387)
1197     // Note that this value is also credited (in output.cpp) to
1198     // the size of the code section.
1199     return NativeJump::instruction_size;
1200   }
1201 
1202 #ifdef _LP64
1203   static uint size_deopt_handler() {
1204     // three 5-byte instructions plus one move for the unreachable address.
1205     return 15+3;
1206   }
1207 #else
1208   static uint size_deopt_handler() {
1209     // NativeCall instruction size is the same as NativeJump.
1210     // The exception handler starts out as a jump and can be patched to
1211     // a call by deoptimization.  (4932387)
1212     // Note that this value is also credited (in output.cpp) to
1213     // the size of the code section.
1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1215   }
1216 #endif
1217 };
1218 
1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
1220   switch(bytes) {
1221     case  4: // fall-through
1222     case  8: // fall-through
1223     case 16: return Assembler::AVX_128bit;
1224     case 32: return Assembler::AVX_256bit;
1225     case 64: return Assembler::AVX_512bit;
1226 
1227     default: {
1228       ShouldNotReachHere();
1229       return Assembler::AVX_NoVec;
1230     }
1231   }
1232 }
1233 
1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
1236 }
1237 
1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1239   uint def_idx = use->operand_index(opnd);
1240   Node* def = use->in(def_idx);
1241   return vector_length_encoding(def);
1242 }
1243 
1244 static inline bool is_unsigned_booltest_pred(int bt) {
1245   return  ((bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare);
1246 }
1247 
1248 class Node::PD {
1249 public:
1250   enum NodeFlags {
1251     Flag_intel_jcc_erratum = Node::_last_flag << 1,
1252     _last_flag             = Flag_intel_jcc_erratum
1253   };
1254 };
1255 
1256 %} // end source_hpp
1257 
1258 source %{
1259 
1260 #include "opto/addnode.hpp"
1261 #include "c2_intelJccErratum_x86.hpp"
1262 
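     // The three hooks below implement the Intel JCC erratum mitigation:
     // pd_perform_mach_node_analysis() tags affected MachNodes and reserves the
     // worst-case extra space in the code buffer, pd_alignment_required() asks
     // for conservative alignment of tagged branches, and compute_padding()
     // computes the padding actually emitted in front of such a branch
     // (see c2_intelJccErratum_x86.hpp for the details of the erratum itself).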
1263 void PhaseOutput::pd_perform_mach_node_analysis() {
1264   if (VM_Version::has_intel_jcc_erratum()) {
1265     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
1266     _buf_sizes._code += extra_padding;
1267   }
1268 }
1269 
1270 int MachNode::pd_alignment_required() const {
1271   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
1272     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
1273     return IntelJccErratum::largest_jcc_size() + 1;
1274   } else {
1275     return 1;
1276   }
1277 }
1278 
1279 int MachNode::compute_padding(int current_offset) const {
1280   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
1281     Compile* C = Compile::current();
1282     PhaseOutput* output = C->output();
1283     Block* block = output->block();
1284     int index = output->index();
1285     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
1286   } else {
1287     return 0;
1288   }
1289 }
1290 
1291 // Emit exception handler code.
1292 // Stuff framesize into a register and call a VM stub routine.
1293 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1294 
1295   // Note that the code buffer's insts_mark is always relative to insts.
1296   // That's why we must use the macroassembler to generate a handler.
1297   C2_MacroAssembler _masm(&cbuf);
1298   address base = __ start_a_stub(size_exception_handler());
1299   if (base == NULL) {
1300     ciEnv::current()->record_failure("CodeCache is full");
1301     return 0;  // CodeBuffer::expand failed
1302   }
1303   int offset = __ offset();
1304   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1305   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1306   __ end_a_stub();
1307   return offset;
1308 }
1309 
1310 // Emit deopt handler code.
1311 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1312 
1313   // Note that the code buffer's insts_mark is always relative to insts.
1314   // That's why we must use the macroassembler to generate a handler.
1315   C2_MacroAssembler _masm(&cbuf);
1316   address base = __ start_a_stub(size_deopt_handler());
1317   if (base == NULL) {
1318     ciEnv::current()->record_failure("CodeCache is full");
1319     return 0;  // CodeBuffer::expand failed
1320   }
1321   int offset = __ offset();
1322 
1323 #ifdef _LP64
1324   address the_pc = (address) __ pc();
1325   Label next;
1326   // Push the value of "the_pc" on the stack without destroying any
1327   // registers, as they may all be live.
1328 
1329   // push address of "next"
1330   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1331   __ bind(next);
1332   // adjust it so it matches "the_pc"
1333   __ subptr(Address(rsp, 0), __ offset() - offset);
1334 #else
1335   InternalAddress here(__ pc());
1336   __ pushptr(here.addr());
1337 #endif
1338 
1339   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1340   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1341   __ end_a_stub();
1342   return offset;
1343 }
1344 
1345 Assembler::Width widthForType(BasicType bt) {
1346   if (bt == T_BYTE) {
1347     return Assembler::B;
1348   } else if (bt == T_SHORT) {
1349     return Assembler::W;
1350   } else if (bt == T_INT) {
1351     return Assembler::D;
1352   } else {
1353     assert(bt == T_LONG, "not a long: %s", type2name(bt));
1354     return Assembler::Q;
1355   }
1356 }
1357 
1358 //=============================================================================
1359 
1360   // Float masks come from different places depending on platform.
1361 #ifdef _LP64
1362   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1363   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1364   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1365   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1366 #else
1367   static address float_signmask()  { return (address)float_signmask_pool; }
1368   static address float_signflip()  { return (address)float_signflip_pool; }
1369   static address double_signmask() { return (address)double_signmask_pool; }
1370   static address double_signflip() { return (address)double_signflip_pool; }
1371 #endif
1372   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1373   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1374   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1375   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1376   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1377   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1378   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
1379   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1380   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1381   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1382   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1383   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
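       // Illustrative use of the constants above (a sketch, not an exhaustive
       // list): the sign-mask and sign-flip tables are typically consumed by
       // packed bitwise ops, along the lines of
       //   __ andps(dst, ExternalAddress(float_signmask()));  // abs(float)
       //   __ xorps(dst, ExternalAddress(float_signflip()));  // negate(float)
       // with the corresponding double/64-bit masks used for T_DOUBLE values.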
1384 
1385 //=============================================================================
1386 const bool Matcher::match_rule_supported(int opcode) {
1387   if (!has_match_rule(opcode)) {
1388     return false; // no match rule present
1389   }
1390   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1391   switch (opcode) {
1392     case Op_AbsVL:
1393     case Op_StoreVectorScatter:
1394       if (UseAVX < 3) {
1395         return false;
1396       }
1397       break;
1398     case Op_PopCountI:
1399     case Op_PopCountL:
1400       if (!UsePopCountInstruction) {
1401         return false;
1402       }
1403       break;
1404     case Op_PopCountVI:
1405       if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
1406         return false;
1407       }
1408       break;
1409     case Op_MulVI:
1410       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
1411         return false;
1412       }
1413       break;
1414     case Op_MulVL:
1415       if (UseSSE < 4) { // only with SSE4_1 or AVX
1416         return false;
1417       }
1418       break;
1419     case Op_MulReductionVL:
1420       if (VM_Version::supports_avx512dq() == false) {
1421         return false;
1422       }
1423       break;
1424     case Op_AddReductionVL:
1425       if (UseSSE < 2) { // requires at least SSE2
1426         return false;
1427       }
1428       break;
1429     case Op_AbsVB:
1430     case Op_AbsVS:
1431     case Op_AbsVI:
1432     case Op_AddReductionVI:
1433     case Op_AndReductionV:
1434     case Op_OrReductionV:
1435     case Op_XorReductionV:
1436       if (UseSSE < 3) { // requires at least SSSE3
1437         return false;
1438       }
1439       break;
1440     case Op_VectorLoadShuffle:
1441     case Op_VectorRearrange:
1442     case Op_MulReductionVI:
1443       if (UseSSE < 4) { // requires at least SSE4
1444         return false;
1445       }
1446       break;
1447     case Op_SqrtVD:
1448     case Op_SqrtVF:
1449     case Op_VectorMaskCmp:
1450     case Op_VectorCastB2X:
1451     case Op_VectorCastS2X:
1452     case Op_VectorCastI2X:
1453     case Op_VectorCastL2X:
1454     case Op_VectorCastF2X:
1455     case Op_VectorCastD2X:
1456       if (UseAVX < 1) { // enabled for AVX only
1457         return false;
1458       }
1459       break;
1460     case Op_CompareAndSwapL:
1461 #ifdef _LP64
1462     case Op_CompareAndSwapP:
1463 #endif
1464       if (!VM_Version::supports_cx8()) {
1465         return false;
1466       }
1467       break;
1468     case Op_CMoveVF:
1469     case Op_CMoveVD:
1470       if (UseAVX < 1) { // enabled for AVX only
1471         return false;
1472       }
1473       break;
1474     case Op_StrIndexOf:
1475       if (!UseSSE42Intrinsics) {
1476         return false;
1477       }
1478       break;
1479     case Op_StrIndexOfChar:
1480       if (!UseSSE42Intrinsics) {
1481         return false;
1482       }
1483       break;
1484     case Op_OnSpinWait:
1485       if (VM_Version::supports_on_spin_wait() == false) {
1486         return false;
1487       }
1488       break;
1489     case Op_MulVB:
1490     case Op_LShiftVB:
1491     case Op_RShiftVB:
1492     case Op_URShiftVB:
1493     case Op_VectorInsert:
1494     case Op_VectorLoadMask:
1495     case Op_VectorStoreMask:
1496     case Op_VectorBlend:
1497       if (UseSSE < 4) {
1498         return false;
1499       }
1500       break;
1501 #ifdef _LP64
1502     case Op_MaxD:
1503     case Op_MaxF:
1504     case Op_MinD:
1505     case Op_MinF:
1506       if (UseAVX < 1) { // enabled for AVX only
1507         return false;
1508       }
1509       break;
1510 #endif
1511     case Op_CacheWB:
1512     case Op_CacheWBPreSync:
1513     case Op_CacheWBPostSync:
1514       if (!VM_Version::supports_data_cache_line_flush()) {
1515         return false;
1516       }
1517       break;
1518     case Op_ExtractB:
1519     case Op_ExtractL:
1520     case Op_ExtractI:
1521     case Op_RoundDoubleMode:
1522       if (UseSSE < 4) {
1523         return false;
1524       }
1525       break;
1526     case Op_RoundDoubleModeV:
1527       if (VM_Version::supports_avx() == false) {
1528         return false; // 128bit vroundpd is not available
1529       }
1530       break;
1531     case Op_LoadVectorGather:
1532       if (UseAVX < 2) {
1533         return false;
1534       }
1535       break;
1536     case Op_FmaVD:
1537     case Op_FmaVF:
1538       if (!UseFMA) {
1539         return false;
1540       }
1541       break;
1542     case Op_MacroLogicV:
1543       if (UseAVX < 3 || !UseVectorMacroLogic) {
1544         return false;
1545       }
1546       break;
1547 
1548     case Op_VectorCmpMasked:
1549     case Op_VectorMaskGen:
1550     case Op_LoadVectorMasked:
1551     case Op_StoreVectorMasked:
1552       if (!is_LP64  || UseAVX < 3 || !VM_Version::supports_bmi2()) {
1553         return false;
1554       }
1555       break;
1556     case Op_VectorMaskFirstTrue:
1557     case Op_VectorMaskLastTrue:
1558     case Op_VectorMaskTrueCount:
1559       if (!is_LP64 || UseAVX < 1) {
1560          return false;
1561       }
1562       break;
1563     case Op_CopySignD:
1564     case Op_CopySignF:
1565       if (UseAVX < 3 || !is_LP64)  {
1566         return false;
1567       }
1568       if (!VM_Version::supports_avx512vl()) {
1569         return false;
1570       }
1571       break;
1572 #ifndef _LP64
1573     case Op_AddReductionVF:
1574     case Op_AddReductionVD:
1575     case Op_MulReductionVF:
1576     case Op_MulReductionVD:
1577       if (UseSSE < 1) { // requires at least SSE
1578         return false;
1579       }
1580       break;
1581     case Op_MulAddVS2VI:
1582     case Op_RShiftVL:
1583     case Op_AbsVD:
1584     case Op_NegVD:
1585       if (UseSSE < 2) {
1586         return false;
1587       }
1588       break;
1589 #endif // !LP64
1590     case Op_SignumF:
1591       if (UseSSE < 1) {
1592         return false;
1593       }
1594       break;
1595     case Op_SignumD:
1596       if (UseSSE < 2) {
1597         return false;
1598       }
1599       break;
1600   }
1601   return true;  // Match rules are supported by default.
1602 }
1603 
1604 //------------------------------------------------------------------------
1605 
1606 // Identify extra cases that we might want to provide match rules for vector nodes and
1607 // other intrinsics guarded with vector length (vlen) and element type (bt).
1608 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1609   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1610   if (!match_rule_supported(opcode)) {
1611     return false;
1612   }
1613   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1614   //   * SSE2 supports 128bit vectors for all types;
1615   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1616   //   * AVX2 supports 256bit vectors for all types;
1617   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1618   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1619   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1620   // And MaxVectorSize is taken into account as well.
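       // For example (an illustrative reading of the restrictions above, not an
       // additional rule): a 512bit vector of T_SHORT (vlen == 32) passes this
       // filter only when AVX512BW is available and MaxVectorSize >= 64.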
1621   if (!vector_size_supported(bt, vlen)) {
1622     return false;
1623   }
1624   // Special cases which require vector length follow:
1625   //   * implementation limitations
1626   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1627   //   * 128bit vroundpd instruction is present only in AVX1
1628   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1629   switch (opcode) {
1630     case Op_AbsVF:
1631     case Op_NegVF:
1632       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1633         return false; // 512bit vandps and vxorps are not available
1634       }
1635       break;
1636     case Op_AbsVD:
1637     case Op_NegVD:
1638     case Op_MulVL:
1639       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1640         return false; // 512bit vpmullq, vandpd and vxorpd are not available
1641       }
1642       break;
1643     case Op_CMoveVF:
1644       if (vlen != 8) {
1645         return false; // implementation limitation (only vcmov8F_reg is present)
1646       }
1647       break;
1648     case Op_RotateRightV:
1649     case Op_RotateLeftV:
1650       if (bt != T_INT && bt != T_LONG) {
1651         return false;
1652       } // fallthrough
1653     case Op_MacroLogicV:
1654       if (!VM_Version::supports_evex() ||
1655           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
1656         return false;
1657       }
1658       break;
1659     case Op_ClearArray:
1660     case Op_VectorMaskGen:
1661     case Op_VectorCmpMasked:
1662     case Op_LoadVectorMasked:
1663     case Op_StoreVectorMasked:
1664       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
1665         return false;
1666       }
1667       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
1668         return false;
1669       }
1670       break;
1671     case Op_CMoveVD:
1672       if (vlen != 4) {
1673         return false; // implementation limitation (only vcmov4D_reg is present)
1674       }
1675       break;
1676     case Op_MaxV:
1677     case Op_MinV:
1678       if (UseSSE < 4 && is_integral_type(bt)) {
1679         return false;
1680       }
1681       if (bt == T_FLOAT || bt == T_DOUBLE) {
1682         // Float/Double intrinsics are enabled for AVX family currently.
1683         if (UseAVX == 0) {
1684           return false;
1685         }
1686         if (UseAVX > 2 && !VM_Version::supports_avx512dq() && size_in_bits == 512) { // 512 bit Float/Double intrinsics need AVX512DQ
1687           return false;
1688         }
1689       }
1690       break;
1691     case Op_CallLeafVector:
1692       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
1693         return false;
1694       }
1695       break;
1696     case Op_AddReductionVI:
1697       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
1698         return false;
1699       }
1700       // fallthrough
1701     case Op_AndReductionV:
1702     case Op_OrReductionV:
1703     case Op_XorReductionV:
1704       if (is_subword_type(bt) && (UseSSE < 4)) {
1705         return false;
1706       }
1707 #ifndef _LP64
1708       if (bt == T_BYTE || bt == T_LONG) {
1709         return false;
1710       }
1711 #endif
1712       break;
1713 #ifndef _LP64
1714     case Op_VectorInsert:
1715       if (bt == T_LONG || bt == T_DOUBLE) {
1716         return false;
1717       }
1718       break;
1719 #endif
1720     case Op_MinReductionV:
1721     case Op_MaxReductionV:
1722       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
1723         return false;
1724       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
1725         return false;
1726       }
1727       // Float/Double intrinsics enabled for AVX family.
1728       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
1729         return false;
1730       }
1731       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
1732         return false;
1733       }
1734 #ifndef _LP64
1735       if (bt == T_BYTE || bt == T_LONG) {
1736         return false;
1737       }
1738 #endif
1739       break;
1740     case Op_VectorTest:
1741       if (UseSSE < 4) {
1742         return false; // Implementation limitation
1743       } else if (size_in_bits < 32) {
1744         return false; // Implementation limitation
1745       } else if (size_in_bits == 512 && (VM_Version::supports_avx512bw() == false)) {
1746         return false; // Implementation limitation
1747       }
1748       break;
1749     case Op_VectorLoadShuffle:
1750     case Op_VectorRearrange:
1751       if (vlen == 2) {
1752         return false; // Implementation limitation due to how shuffle is loaded
1753       } else if (size_in_bits == 256 && UseAVX < 2) {
1754         return false; // Implementation limitation
1755       } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi())  {
1756         return false; // Implementation limitation
1757       } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw())  {
1758         return false; // Implementation limitation
1759       }
1760       break;
1761     case Op_VectorLoadMask:
1762       if (size_in_bits == 256 && UseAVX < 2) {
1763         return false; // Implementation limitation
1764       }
1765       // fallthrough
1766     case Op_VectorStoreMask:
1767       if (vlen == 2) {
1768         return false; // Implementation limitation
1769       }
1770       break;
1771     case Op_VectorCastB2X:
1772       if (size_in_bits == 256 && UseAVX < 2) {
1773         return false; // Implementation limitation
1774       }
1775       break;
1776     case Op_VectorCastS2X:
1777       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1778         return false;
1779       }
1780       break;
1781     case Op_VectorCastI2X:
1782       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1783         return false;
1784       }
1785       break;
1786     case Op_VectorCastL2X:
1787       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1788         return false;
1789       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
1790         return false;
1791       }
1792       break;
1793     case Op_VectorCastF2X:
1794     case Op_VectorCastD2X:
1795       if (is_integral_type(bt)) {
1796         // Casts from FP to integral types require special fixup logic not easily
1797         // implementable with vectors.
1798         return false; // Implementation limitation
1799       }
           break;
1800     case Op_MulReductionVI:
1801       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1802         return false;
1803       }
1804       break;
1805     case Op_StoreVectorScatter:
1806       if (bt == T_BYTE || bt == T_SHORT) {
1807         return false;
1808       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1809         return false;
1810       }
1811       // fallthrough
1812     case Op_LoadVectorGather:
1813       if (size_in_bits == 64) {
1814         return false;
1815       }
1816       break;
1817     case Op_VectorMaskCmp:
1818       if (vlen < 2 || size_in_bits < 32) {
1819         return false;
1820       }
1821       break;
1822   }
1823   return true;  // Per default match rules are supported.
1824 }
1825 
1826 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
1827   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
1828   bool legacy = (generic_opnd->opcode() == LEGVEC);
1829   if (!VM_Version::supports_avx512vlbwdq() && // KNL
1830       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
1831     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
1832     return new legVecZOper();
1833   }
1834   if (legacy) {
1835     switch (ideal_reg) {
1836       case Op_VecS: return new legVecSOper();
1837       case Op_VecD: return new legVecDOper();
1838       case Op_VecX: return new legVecXOper();
1839       case Op_VecY: return new legVecYOper();
1840       case Op_VecZ: return new legVecZOper();
1841     }
1842   } else {
1843     switch (ideal_reg) {
1844       case Op_VecS: return new vecSOper();
1845       case Op_VecD: return new vecDOper();
1846       case Op_VecX: return new vecXOper();
1847       case Op_VecY: return new vecYOper();
1848       case Op_VecZ: return new vecZOper();
1849     }
1850   }
1851   ShouldNotReachHere();
1852   return NULL;
1853 }
1854 
1855 bool Matcher::is_reg2reg_move(MachNode* m) {
1856   switch (m->rule()) {
1857     case MoveVec2Leg_rule:
1858     case MoveLeg2Vec_rule:
1859     case MoveF2VL_rule:
1860     case MoveF2LEG_rule:
1861     case MoveVL2F_rule:
1862     case MoveLEG2F_rule:
1863     case MoveD2VL_rule:
1864     case MoveD2LEG_rule:
1865     case MoveVL2D_rule:
1866     case MoveLEG2D_rule:
1867       return true;
1868     default:
1869       return false;
1870   }
1871 }
1872 
1873 bool Matcher::is_generic_vector(MachOper* opnd) {
1874   switch (opnd->opcode()) {
1875     case VEC:
1876     case LEGVEC:
1877       return true;
1878     default:
1879       return false;
1880   }
1881 }
1882 
1883 //------------------------------------------------------------------------
1884 
1885 const RegMask* Matcher::predicate_reg_mask(void) {
1886   return &_VECTMASK_REG_mask;
1887 }
1888 
1889 const TypeVect* Matcher::predicate_reg_type(const Type* elemTy, int length) {
1890   return new TypeVectMask(TypeInt::BOOL, length);
1891 }
1892 
1893 // Max vector size in bytes. 0 if not supported.
1894 const int Matcher::vector_width_in_bytes(BasicType bt) {
1895   assert(is_java_primitive(bt), "only primitive type vectors");
1896   if (UseSSE < 2) return 0;
1897   // SSE2 supports 128bit vectors for all types.
1898   // AVX2 supports 256bit vectors for all types.
1899   // AVX512/EVEX supports 512bit vectors for all types.
1900   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1901   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1902   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1903     size = (UseAVX > 2) ? 64 : 32;
1904   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1905     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1906   // Use flag to limit vector size.
1907   size = MIN2(size,(int)MaxVectorSize);
1908   // Minimum 2 values in vector (or 4 for bytes).
1909   switch (bt) {
1910   case T_DOUBLE:
1911   case T_LONG:
1912     if (size < 16) return 0;
1913     break;
1914   case T_FLOAT:
1915   case T_INT:
1916     if (size < 8) return 0;
1917     break;
1918   case T_BOOLEAN:
1919     if (size < 4) return 0;
1920     break;
1921   case T_CHAR:
1922     if (size < 4) return 0;
1923     break;
1924   case T_BYTE:
1925     if (size < 4) return 0;
1926     break;
1927   case T_SHORT:
1928     if (size < 4) return 0;
1929     break;
1930   default:
1931     ShouldNotReachHere();
1932   }
1933   return size;
1934 }
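     // A worked example of the sizing logic above (illustrative only): with
     // UseAVX == 2 the starting size is (1 << 2) * 8 = 32 bytes for every type;
     // with UseAVX == 3 it is 64 bytes, except that BYTE/SHORT/CHAR vectors fall
     // back to 32 bytes without AVX512BW, and the result is always capped by
     // MaxVectorSize and subject to the per-type minimums in the switch above.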
1935 
1936 // Limits on vector size (number of elements) loaded into vector.
1937 const int Matcher::max_vector_size(const BasicType bt) {
1938   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1939 }
1940 const int Matcher::min_vector_size(const BasicType bt) {
1941   int max_size = max_vector_size(bt);
1942   // Min size which can be loaded into vector is 4 bytes.
1943   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1944   // Support for calling svml double64 vectors
1945   if (bt == T_DOUBLE) {
1946     size = 1;
1947   }
1948   return MIN2(size,max_size);
1949 }
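     // Consequently (illustrative): the smallest vectors accepted are 4 elements
     // for byte-sized element types, 2 elements for the other types, and a single
     // element for T_DOUBLE (to allow SVML calls on double64 vectors), always
     // clipped to max_vector_size(bt).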
1950 
1951 const int Matcher::scalable_vector_reg_size(const BasicType bt) {
1952   return -1;
1953 }
1954 
1955 // Vector ideal reg corresponding to specified size in bytes
1956 const uint Matcher::vector_ideal_reg(int size) {
1957   assert(MaxVectorSize >= size, "");
1958   switch(size) {
1959     case  4: return Op_VecS;
1960     case  8: return Op_VecD;
1961     case 16: return Op_VecX;
1962     case 32: return Op_VecY;
1963     case 64: return Op_VecZ;
1964   }
1965   ShouldNotReachHere();
1966   return 0;
1967 }
1968 
1969 // Check for shift by small constant as well
1970 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1971   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1972       shift->in(2)->get_int() <= 3 &&
1973       // Are there other uses besides address expressions?
1974       !matcher->is_visited(shift)) {
1975     address_visited.set(shift->_idx); // Flag as address_visited
1976     mstack.push(shift->in(2), Matcher::Visit);
1977     Node *conv = shift->in(1);
1978 #ifdef _LP64
1979     // Allow the Matcher to match the rule which bypasses the
1980     // ConvI2L operation for an array index on LP64
1981     // if the index value is positive.
1982     if (conv->Opcode() == Op_ConvI2L &&
1983         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1984         // Are there other uses besides address expressions?
1985         !matcher->is_visited(conv)) {
1986       address_visited.set(conv->_idx); // Flag as address_visited
1987       mstack.push(conv->in(1), Matcher::Pre_Visit);
1988     } else
1989 #endif
1990       mstack.push(conv, Matcher::Pre_Visit);
1991     return true;
1992   }
1993   return false;
1994 }
1995 
1996 // This function identifies sub-graphs in which a 'load' node is
1997 // input to two different nodes and can be matched
1998 // with BMI instructions like blsi, blsr, etc.
1999 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
2000 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
2001 // refers to the same node.
2002 //
2003 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
2004 // This is a temporary solution until we make DAGs expressible in ADL.
2005 template<typename ConType>
2006 class FusedPatternMatcher {
2007   Node* _op1_node;
2008   Node* _mop_node;
2009   int _con_op;
2010 
2011   static int match_next(Node* n, int next_op, int next_op_idx) {
2012     if (n->in(1) == NULL || n->in(2) == NULL) {
2013       return -1;
2014     }
2015 
2016     if (next_op_idx == -1) { // n is commutative, try rotations
2017       if (n->in(1)->Opcode() == next_op) {
2018         return 1;
2019       } else if (n->in(2)->Opcode() == next_op) {
2020         return 2;
2021       }
2022     } else {
2023       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
2024       if (n->in(next_op_idx)->Opcode() == next_op) {
2025         return next_op_idx;
2026       }
2027     }
2028     return -1;
2029   }
2030 
2031  public:
2032   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
2033     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
2034 
2035   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
2036              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
2037              typename ConType::NativeType con_value) {
2038     if (_op1_node->Opcode() != op1) {
2039       return false;
2040     }
2041     if (_mop_node->outcnt() > 2) {
2042       return false;
2043     }
2044     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
2045     if (op1_op2_idx == -1) {
2046       return false;
2047     }
2048     // Memory operation must be the other edge
2049     int op1_mop_idx = (op1_op2_idx & 1) + 1;
2050 
2051     // Check that the mop node is really what we want
2052     if (_op1_node->in(op1_mop_idx) == _mop_node) {
2053       Node* op2_node = _op1_node->in(op1_op2_idx);
2054       if (op2_node->outcnt() > 1) {
2055         return false;
2056       }
2057       assert(op2_node->Opcode() == op2, "Should be");
2058       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
2059       if (op2_con_idx == -1) {
2060         return false;
2061       }
2062       // Memory operation must be the other edge
2063       int op2_mop_idx = (op2_con_idx & 1) + 1;
2064       // Check that the memory operation is the same node
2065       if (op2_node->in(op2_mop_idx) == _mop_node) {
2066         // Now check the constant
2067         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
2068         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
2069           return true;
2070         }
2071       }
2072     }
2073     return false;
2074   }
2075 };
2076 
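     // How FusedPatternMatcher is meant to be read (an illustrative note): a call
     // such as
     //   FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
     //   bmii.match(Op_AndI, -1, Op_SubI, 1, 0);
     // asks whether 'n' is (AndI (SubI ConI(0) m) m), trying the AndI inputs in
     // either order (index -1) and requiring the constant at input 1 of the SubI.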
2077 static bool is_bmi_pattern(Node* n, Node* m) {
2078   assert(UseBMI1Instructions, "sanity");
2079   if (n != NULL && m != NULL) {
2080     if (m->Opcode() == Op_LoadI) {
2081       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
2082       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
2083              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
2084              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
2085     } else if (m->Opcode() == Op_LoadL) {
2086       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
2087       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
2088              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
2089              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
2090     }
2091   }
2092   return false;
2093 }
2094 
2095 // Should the matcher clone input 'm' of node 'n'?
2096 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
2097   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
2098   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
2099     mstack.push(m, Visit);
2100     return true;
2101   }
2102   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
2103     mstack.push(m, Visit);           // m = ShiftCntV
2104     return true;
2105   }
2106   return false;
2107 }
2108 
2109 // Should the Matcher clone shifts on addressing modes, expecting them
2110 // to be subsumed into complex addressing expressions or compute them
2111 // into registers?
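     // For example (illustrative): an address of the form
     //   (AddP base (AddP base (LShiftL idx con)) offset)
     // with con <= 3 and a 32-bit constant offset is cloned below so that
     // matching can fold it into a single [base + idx*scale + disp] addressing
     // mode instead of computing the shifted index into a register.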
2112 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
2113   Node *off = m->in(AddPNode::Offset);
2114   if (off->is_Con()) {
2115     address_visited.test_set(m->_idx); // Flag as address_visited
2116     Node *adr = m->in(AddPNode::Address);
2117 
2118     // Intel can handle 2 adds in addressing mode
2119     // AtomicAdd is not an addressing expression.
2120     // Cheap to find it by looking for screwy base.
2121     if (adr->is_AddP() &&
2122         !adr->in(AddPNode::Base)->is_top() &&
2123         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
2124         // Are there other uses besides address expressions?
2125         !is_visited(adr)) {
2126       address_visited.set(adr->_idx); // Flag as address_visited
2127       Node *shift = adr->in(AddPNode::Offset);
2128       if (!clone_shift(shift, this, mstack, address_visited)) {
2129         mstack.push(shift, Pre_Visit);
2130       }
2131       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
2132       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
2133     } else {
2134       mstack.push(adr, Pre_Visit);
2135     }
2136 
2137     // Clone X+offset as it also folds into most addressing expressions
2138     mstack.push(off, Visit);
2139     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2140     return true;
2141   } else if (clone_shift(off, this, mstack, address_visited)) {
2142     address_visited.test_set(m->_idx); // Flag as address_visited
2143     mstack.push(m->in(AddPNode::Address), Pre_Visit);
2144     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2145     return true;
2146   }
2147   return false;
2148 }
2149 
2150 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
2151   switch (bt) {
2152     case BoolTest::eq:
2153       return Assembler::eq;
2154     case BoolTest::ne:
2155       return Assembler::neq;
2156     case BoolTest::le:
2157     case BoolTest::ule:
2158       return Assembler::le;
2159     case BoolTest::ge:
2160     case BoolTest::uge:
2161       return Assembler::nlt;
2162     case BoolTest::lt:
2163     case BoolTest::ult:
2164       return Assembler::lt;
2165     case BoolTest::gt:
2166     case BoolTest::ugt:
2167       return Assembler::nle;
2168     default : ShouldNotReachHere(); return Assembler::_false;
2169   }
2170 }
2171 
2172 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
2173   switch (bt) {
2174   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
2175   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
2176   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
2177   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
2178   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
2179   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
2180   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
2181   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
2182   }
2183 }
2184 
2185 // Helper methods for MachSpillCopyNode::implementation().
2186 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
2187                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
2188   assert(ireg == Op_VecS || // 32bit vector
2189          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
2190          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
2191          "no non-adjacent vector moves" );
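       // Note (illustrative): on AVX-512 targets without AVX512VL, plain
       // (v)movdqu cannot encode XMM16-XMM31, so the copies below fall back to
       // vextractf32x4/vextractf64x4 with lane 0, i.e. an EVEX-encoded move of
       // the low 128/256 bits that is available with AVX512F alone.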
2192   if (cbuf) {
2193     C2_MacroAssembler _masm(cbuf);
2194     switch (ireg) {
2195     case Op_VecS: // copy whole register
2196     case Op_VecD:
2197     case Op_VecX:
2198 #ifndef _LP64
2199       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2200 #else
2201       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2202         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2203       } else {
2204         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2205       }
2206 #endif
2207       break;
2208     case Op_VecY:
2209 #ifndef _LP64
2210       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2211 #else
2212       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2213         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2214       } else {
2215         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2216       }
2217 #endif
2218       break;
2219     case Op_VecZ:
2220       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
2221       break;
2222     default:
2223       ShouldNotReachHere();
2224     }
2225 #ifndef PRODUCT
2226   } else {
2227     switch (ireg) {
2228     case Op_VecS:
2229     case Op_VecD:
2230     case Op_VecX:
2231       st->print("movdqu  %s,%s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
2232       break;
2233     case Op_VecY:
2234     case Op_VecZ:
2235       st->print("vmovdqu %s,%s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
2236       break;
2237     default:
2238       ShouldNotReachHere();
2239     }
2240 #endif
2241   }
2242 }
2243 
2244 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
2245                      int stack_offset, int reg, uint ireg, outputStream* st) {
2246   if (cbuf) {
2247     C2_MacroAssembler _masm(cbuf);
2248     if (is_load) {
2249       switch (ireg) {
2250       case Op_VecS:
2251         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2252         break;
2253       case Op_VecD:
2254         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2255         break;
2256       case Op_VecX:
2257 #ifndef _LP64
2258         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2259 #else
2260         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2261           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2262         } else {
2263           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2264           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2265         }
2266 #endif
2267         break;
2268       case Op_VecY:
2269 #ifndef _LP64
2270         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2271 #else
2272         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2273           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2274         } else {
2275           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2276           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2277         }
2278 #endif
2279         break;
2280       case Op_VecZ:
2281         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
2282         break;
2283       default:
2284         ShouldNotReachHere();
2285       }
2286     } else { // store
2287       switch (ireg) {
2288       case Op_VecS:
2289         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2290         break;
2291       case Op_VecD:
2292         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2293         break;
2294       case Op_VecX:
2295 #ifndef _LP64
2296         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2297 #else
2298         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2299           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2300         }
2301         else {
2302           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2303         }
2304 #endif
2305         break;
2306       case Op_VecY:
2307 #ifndef _LP64
2308         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2309 #else
2310         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2311           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2312         }
2313         else {
2314           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2315         }
2316 #endif
2317         break;
2318       case Op_VecZ:
2319         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2320         break;
2321       default:
2322         ShouldNotReachHere();
2323       }
2324     }
2325 #ifndef PRODUCT
2326   } else {
2327     if (is_load) {
2328       switch (ireg) {
2329       case Op_VecS:
2330         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2331         break;
2332       case Op_VecD:
2333         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2334         break;
2335       case Op_VecX:
2336         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2337         break;
2338       case Op_VecY:
2339       case Op_VecZ:
2340         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2341         break;
2342       default:
2343         ShouldNotReachHere();
2344       }
2345     } else { // store
2346       switch (ireg) {
2347       case Op_VecS:
2348         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2349         break;
2350       case Op_VecD:
2351         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2352         break;
2353       case Op_VecX:
2354         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2355         break;
2356       case Op_VecY:
2357       case Op_VecZ:
2358         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2359         break;
2360       default:
2361         ShouldNotReachHere();
2362       }
2363     }
2364 #endif
2365   }
2366 }
2367 
2368 static inline jlong replicate8_imm(int con, int width) {
2369   // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
2370   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
2371   int bit_width = width * 8;
2372   jlong val = con;
2373   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
2374   while (bit_width < 64) {
2375     val |= (val << bit_width);
2376     bit_width <<= 1;
2377   }
2378   return val;
2379 }
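     // Worked example: replicate8_imm(0x42, 1) == 0x4242424242424242 and
     // replicate8_imm(0xBEEF, 2) == 0xBEEFBEEFBEEFBEEF. Masking first keeps only
     // the low "width" bytes, so a negative constant such as -2 with width 1
     // replicates as 0xFEFEFEFEFEFEFEFE rather than a sign-extended 64-bit value.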
2380 
2381 #ifndef PRODUCT
2382   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2383     st->print("nop \t# %d bytes pad for loops and calls", _count);
2384   }
2385 #endif
2386 
2387   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
2388     C2_MacroAssembler _masm(&cbuf);
2389     __ nop(_count);
2390   }
2391 
2392   uint MachNopNode::size(PhaseRegAlloc*) const {
2393     return _count;
2394   }
2395 
2396 #ifndef PRODUCT
2397   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2398     st->print("# breakpoint");
2399   }
2400 #endif
2401 
2402   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2403     C2_MacroAssembler _masm(&cbuf);
2404     __ int3();
2405   }
2406 
2407   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2408     return MachNode::size(ra_);
2409   }
2410 
2411 %}
2412 
2413 encode %{
2414 
2415   enc_class call_epilog %{
2416     if (VerifyStackAtCalls) {
2417       // Check that stack depth is unchanged: find majik cookie on stack
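           // This assumes the prolog stored the 0xbadb100d cookie at the same
           // frame slot when VerifyStackAtCalls was set; if a call left rsp
           // unbalanced, the slot no longer matches and the int3 below traps.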
2418       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2419       C2_MacroAssembler _masm(&cbuf);
2420       Label L;
2421       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2422       __ jccb(Assembler::equal, L);
2423       // Die if stack mismatch
2424       __ int3();
2425       __ bind(L);
2426     }
2427   %}
2428 
2429 %}
2430 
2431 // Operands for bound floating-point register arguments
2432 operand rxmm0() %{
2433   constraint(ALLOC_IN_RC(xmm0_reg));
2434   match(VecX);
2435   format %{ %}
2436   interface(REG_INTER);
2437 %}
2438 
2439 //----------OPERANDS-----------------------------------------------------------
2440 // Operand definitions must precede instruction definitions for correct parsing
2441 // in the ADLC because operands constitute user defined types which are used in
2442 // instruction definitions.
2443 
2444 // Vectors
2445 
2446 // Dummy generic vector class. Should be used for all vector operands.
2447 // Replaced with vec[SDXYZ] during post-selection pass.
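     // For example, a node producing a 128-bit value (ideal type VecX) has its
     // generic vec operand rewritten to the vecX operand defined below; the
     // generic form exists so a single instruct rule can match every vector width.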
2448 operand vec() %{
2449   constraint(ALLOC_IN_RC(dynamic));
2450   match(VecX);
2451   match(VecY);
2452   match(VecZ);
2453   match(VecS);
2454   match(VecD);
2455 
2456   format %{ %}
2457   interface(REG_INTER);
2458 %}
2459 
2460 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
2461 // Replaced with legVec[SDXYZ] during post-selection cleanup.
2462 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
2463 // runtime code generation via reg_class_dynamic.
2464 operand legVec() %{
2465   constraint(ALLOC_IN_RC(dynamic));
2466   match(VecX);
2467   match(VecY);
2468   match(VecZ);
2469   match(VecS);
2470   match(VecD);
2471 
2472   format %{ %}
2473   interface(REG_INTER);
2474 %}
2475 
2476 // Replaces vec during post-selection cleanup. See above.
2477 operand vecS() %{
2478   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
2479   match(VecS);
2480 
2481   format %{ %}
2482   interface(REG_INTER);
2483 %}
2484 
2485 // Replaces legVec during post-selection cleanup. See above.
2486 operand legVecS() %{
2487   constraint(ALLOC_IN_RC(vectors_reg_legacy));
2488   match(VecS);
2489 
2490   format %{ %}
2491   interface(REG_INTER);
2492 %}
2493 
2494 // Replaces vec during post-selection cleanup. See above.
2495 operand vecD() %{
2496   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
2497   match(VecD);
2498 
2499   format %{ %}
2500   interface(REG_INTER);
2501 %}
2502 
2503 // Replaces legVec during post-selection cleanup. See above.
2504 operand legVecD() %{
2505   constraint(ALLOC_IN_RC(vectord_reg_legacy));
2506   match(VecD);
2507 
2508   format %{ %}
2509   interface(REG_INTER);
2510 %}
2511 
2512 // Replaces vec during post-selection cleanup. See above.
2513 operand vecX() %{
2514   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
2515   match(VecX);
2516 
2517   format %{ %}
2518   interface(REG_INTER);
2519 %}
2520 
2521 // Replaces legVec during post-selection cleanup. See above.
2522 operand legVecX() %{
2523   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
2524   match(VecX);
2525 
2526   format %{ %}
2527   interface(REG_INTER);
2528 %}
2529 
2530 // Replaces vec during post-selection cleanup. See above.
2531 operand vecY() %{
2532   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
2533   match(VecY);
2534 
2535   format %{ %}
2536   interface(REG_INTER);
2537 %}
2538 
2539 // Replaces legVec during post-selection cleanup. See above.
2540 operand legVecY() %{
2541   constraint(ALLOC_IN_RC(vectory_reg_legacy));
2542   match(VecY);
2543 
2544   format %{ %}
2545   interface(REG_INTER);
2546 %}
2547 
2548 // Replaces vec during post-selection cleanup. See above.
2549 operand vecZ() %{
2550   constraint(ALLOC_IN_RC(vectorz_reg));
2551   match(VecZ);
2552 
2553   format %{ %}
2554   interface(REG_INTER);
2555 %}
2556 
2557 // Replaces legVec during post-selection cleanup. See above.
2558 operand legVecZ() %{
2559   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
2560   match(VecZ);
2561 
2562   format %{ %}
2563   interface(REG_INTER);
2564 %}
2565 
2566 // Comparison Code for FP conditional move
2567 operand cmpOp_vcmppd() %{
2568   match(Bool);
2569 
2570   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2571             n->as_Bool()->_test._test != BoolTest::no_overflow);
2572   format %{ "" %}
2573   interface(COND_INTER) %{
2574     equal        (0x0, "eq");
2575     less         (0x1, "lt");
2576     less_equal   (0x2, "le");
2577     not_equal    (0xC, "ne");
2578     greater_equal(0xD, "ge");
2579     greater      (0xE, "gt");
2580     // TODO: adlc cannot compile this operand without the next two lines; it fails with:
2581     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2582     // equal' for overflow.
2583     overflow     (0x20, "o");  // not really supported by the instruction
2584     no_overflow  (0x21, "no"); // not really supported by the instruction
2585   %}
2586 %}
2587 
2588 
2589 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2590 
2591 // ============================================================================
2592 
2593 instruct ShouldNotReachHere() %{
2594   match(Halt);
2595   format %{ "stop\t# ShouldNotReachHere" %}
2596   ins_encode %{
2597     if (is_reachable()) {
2598       __ stop(_halt_reason);
2599     }
2600   %}
2601   ins_pipe(pipe_slow);
2602 %}
2603 
2604 // =================================EVEX special===============================
2605 // The existing partial implementation of post-loop multi-versioning computes
2606 // the mask corresponding to the tail loop in the K1 opmask register. This mask
2607 // may then be used to predicate instructions in the loop body during the last post-loop iteration.
2608 // TODO: Remove hard-coded K1 usage while fixing existing post-loop
2609 // multiversioning support.
2610 instruct setMask(rRegI dst, rRegI src, kReg_K1 mask) %{
2611   predicate(PostLoopMultiversioning && Matcher::has_predicated_vectors());
2612   match(Set dst (SetVectMaskI  src));
2613   effect(TEMP dst);
2614   format %{ "setvectmask   $dst, $src" %}
2615   ins_encode %{
2616     __ setvectmask($dst$$Register, $src$$Register, $mask$$KRegister);
2617   %}
2618   ins_pipe(pipe_slow);
2619 %}
2620 
2621 // ============================================================================
2622 
2623 instruct addF_reg(regF dst, regF src) %{
2624   predicate((UseSSE>=1) && (UseAVX == 0));
2625   match(Set dst (AddF dst src));
2626 
2627   format %{ "addss   $dst, $src" %}
2628   ins_cost(150);
2629   ins_encode %{
2630     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2631   %}
2632   ins_pipe(pipe_slow);
2633 %}
2634 
2635 instruct addF_mem(regF dst, memory src) %{
2636   predicate((UseSSE>=1) && (UseAVX == 0));
2637   match(Set dst (AddF dst (LoadF src)));
2638 
2639   format %{ "addss   $dst, $src" %}
2640   ins_cost(150);
2641   ins_encode %{
2642     __ addss($dst$$XMMRegister, $src$$Address);
2643   %}
2644   ins_pipe(pipe_slow);
2645 %}
2646 
2647 instruct addF_imm(regF dst, immF con) %{
2648   predicate((UseSSE>=1) && (UseAVX == 0));
2649   match(Set dst (AddF dst con));
2650   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2651   ins_cost(150);
2652   ins_encode %{
2653     __ addss($dst$$XMMRegister, $constantaddress($con));
2654   %}
2655   ins_pipe(pipe_slow);
2656 %}
2657 
2658 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2659   predicate(UseAVX > 0);
2660   match(Set dst (AddF src1 src2));
2661 
2662   format %{ "vaddss  $dst, $src1, $src2" %}
2663   ins_cost(150);
2664   ins_encode %{
2665     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2666   %}
2667   ins_pipe(pipe_slow);
2668 %}
2669 
2670 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2671   predicate(UseAVX > 0);
2672   match(Set dst (AddF src1 (LoadF src2)));
2673 
2674   format %{ "vaddss  $dst, $src1, $src2" %}
2675   ins_cost(150);
2676   ins_encode %{
2677     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2678   %}
2679   ins_pipe(pipe_slow);
2680 %}
2681 
2682 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2683   predicate(UseAVX > 0);
2684   match(Set dst (AddF src con));
2685 
2686   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2687   ins_cost(150);
2688   ins_encode %{
2689     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2690   %}
2691   ins_pipe(pipe_slow);
2692 %}
2693 
2694 instruct addD_reg(regD dst, regD src) %{
2695   predicate((UseSSE>=2) && (UseAVX == 0));
2696   match(Set dst (AddD dst src));
2697 
2698   format %{ "addsd   $dst, $src" %}
2699   ins_cost(150);
2700   ins_encode %{
2701     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2702   %}
2703   ins_pipe(pipe_slow);
2704 %}
2705 
2706 instruct addD_mem(regD dst, memory src) %{
2707   predicate((UseSSE>=2) && (UseAVX == 0));
2708   match(Set dst (AddD dst (LoadD src)));
2709 
2710   format %{ "addsd   $dst, $src" %}
2711   ins_cost(150);
2712   ins_encode %{
2713     __ addsd($dst$$XMMRegister, $src$$Address);
2714   %}
2715   ins_pipe(pipe_slow);
2716 %}
2717 
2718 instruct addD_imm(regD dst, immD con) %{
2719   predicate((UseSSE>=2) && (UseAVX == 0));
2720   match(Set dst (AddD dst con));
2721   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2722   ins_cost(150);
2723   ins_encode %{
2724     __ addsd($dst$$XMMRegister, $constantaddress($con));
2725   %}
2726   ins_pipe(pipe_slow);
2727 %}
2728 
2729 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2730   predicate(UseAVX > 0);
2731   match(Set dst (AddD src1 src2));
2732 
2733   format %{ "vaddsd  $dst, $src1, $src2" %}
2734   ins_cost(150);
2735   ins_encode %{
2736     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2737   %}
2738   ins_pipe(pipe_slow);
2739 %}
2740 
2741 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2742   predicate(UseAVX > 0);
2743   match(Set dst (AddD src1 (LoadD src2)));
2744 
2745   format %{ "vaddsd  $dst, $src1, $src2" %}
2746   ins_cost(150);
2747   ins_encode %{
2748     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2749   %}
2750   ins_pipe(pipe_slow);
2751 %}
2752 
2753 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2754   predicate(UseAVX > 0);
2755   match(Set dst (AddD src con));
2756 
2757   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2758   ins_cost(150);
2759   ins_encode %{
2760     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2761   %}
2762   ins_pipe(pipe_slow);
2763 %}
2764 
2765 instruct subF_reg(regF dst, regF src) %{
2766   predicate((UseSSE>=1) && (UseAVX == 0));
2767   match(Set dst (SubF dst src));
2768 
2769   format %{ "subss   $dst, $src" %}
2770   ins_cost(150);
2771   ins_encode %{
2772     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2773   %}
2774   ins_pipe(pipe_slow);
2775 %}
2776 
2777 instruct subF_mem(regF dst, memory src) %{
2778   predicate((UseSSE>=1) && (UseAVX == 0));
2779   match(Set dst (SubF dst (LoadF src)));
2780 
2781   format %{ "subss   $dst, $src" %}
2782   ins_cost(150);
2783   ins_encode %{
2784     __ subss($dst$$XMMRegister, $src$$Address);
2785   %}
2786   ins_pipe(pipe_slow);
2787 %}
2788 
2789 instruct subF_imm(regF dst, immF con) %{
2790   predicate((UseSSE>=1) && (UseAVX == 0));
2791   match(Set dst (SubF dst con));
2792   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2793   ins_cost(150);
2794   ins_encode %{
2795     __ subss($dst$$XMMRegister, $constantaddress($con));
2796   %}
2797   ins_pipe(pipe_slow);
2798 %}
2799 
2800 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2801   predicate(UseAVX > 0);
2802   match(Set dst (SubF src1 src2));
2803 
2804   format %{ "vsubss  $dst, $src1, $src2" %}
2805   ins_cost(150);
2806   ins_encode %{
2807     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2808   %}
2809   ins_pipe(pipe_slow);
2810 %}
2811 
2812 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2813   predicate(UseAVX > 0);
2814   match(Set dst (SubF src1 (LoadF src2)));
2815 
2816   format %{ "vsubss  $dst, $src1, $src2" %}
2817   ins_cost(150);
2818   ins_encode %{
2819     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2820   %}
2821   ins_pipe(pipe_slow);
2822 %}
2823 
2824 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2825   predicate(UseAVX > 0);
2826   match(Set dst (SubF src con));
2827 
2828   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2829   ins_cost(150);
2830   ins_encode %{
2831     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2832   %}
2833   ins_pipe(pipe_slow);
2834 %}
2835 
2836 instruct subD_reg(regD dst, regD src) %{
2837   predicate((UseSSE>=2) && (UseAVX == 0));
2838   match(Set dst (SubD dst src));
2839 
2840   format %{ "subsd   $dst, $src" %}
2841   ins_cost(150);
2842   ins_encode %{
2843     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2844   %}
2845   ins_pipe(pipe_slow);
2846 %}
2847 
2848 instruct subD_mem(regD dst, memory src) %{
2849   predicate((UseSSE>=2) && (UseAVX == 0));
2850   match(Set dst (SubD dst (LoadD src)));
2851 
2852   format %{ "subsd   $dst, $src" %}
2853   ins_cost(150);
2854   ins_encode %{
2855     __ subsd($dst$$XMMRegister, $src$$Address);
2856   %}
2857   ins_pipe(pipe_slow);
2858 %}
2859 
2860 instruct subD_imm(regD dst, immD con) %{
2861   predicate((UseSSE>=2) && (UseAVX == 0));
2862   match(Set dst (SubD dst con));
2863   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2864   ins_cost(150);
2865   ins_encode %{
2866     __ subsd($dst$$XMMRegister, $constantaddress($con));
2867   %}
2868   ins_pipe(pipe_slow);
2869 %}
2870 
2871 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2872   predicate(UseAVX > 0);
2873   match(Set dst (SubD src1 src2));
2874 
2875   format %{ "vsubsd  $dst, $src1, $src2" %}
2876   ins_cost(150);
2877   ins_encode %{
2878     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2879   %}
2880   ins_pipe(pipe_slow);
2881 %}
2882 
2883 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2884   predicate(UseAVX > 0);
2885   match(Set dst (SubD src1 (LoadD src2)));
2886 
2887   format %{ "vsubsd  $dst, $src1, $src2" %}
2888   ins_cost(150);
2889   ins_encode %{
2890     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2891   %}
2892   ins_pipe(pipe_slow);
2893 %}
2894 
2895 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2896   predicate(UseAVX > 0);
2897   match(Set dst (SubD src con));
2898 
2899   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2900   ins_cost(150);
2901   ins_encode %{
2902     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2903   %}
2904   ins_pipe(pipe_slow);
2905 %}
2906 
2907 instruct mulF_reg(regF dst, regF src) %{
2908   predicate((UseSSE>=1) && (UseAVX == 0));
2909   match(Set dst (MulF dst src));
2910 
2911   format %{ "mulss   $dst, $src" %}
2912   ins_cost(150);
2913   ins_encode %{
2914     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2915   %}
2916   ins_pipe(pipe_slow);
2917 %}
2918 
2919 instruct mulF_mem(regF dst, memory src) %{
2920   predicate((UseSSE>=1) && (UseAVX == 0));
2921   match(Set dst (MulF dst (LoadF src)));
2922 
2923   format %{ "mulss   $dst, $src" %}
2924   ins_cost(150);
2925   ins_encode %{
2926     __ mulss($dst$$XMMRegister, $src$$Address);
2927   %}
2928   ins_pipe(pipe_slow);
2929 %}
2930 
2931 instruct mulF_imm(regF dst, immF con) %{
2932   predicate((UseSSE>=1) && (UseAVX == 0));
2933   match(Set dst (MulF dst con));
2934   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2935   ins_cost(150);
2936   ins_encode %{
2937     __ mulss($dst$$XMMRegister, $constantaddress($con));
2938   %}
2939   ins_pipe(pipe_slow);
2940 %}
2941 
2942 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2943   predicate(UseAVX > 0);
2944   match(Set dst (MulF src1 src2));
2945 
2946   format %{ "vmulss  $dst, $src1, $src2" %}
2947   ins_cost(150);
2948   ins_encode %{
2949     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2950   %}
2951   ins_pipe(pipe_slow);
2952 %}
2953 
2954 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2955   predicate(UseAVX > 0);
2956   match(Set dst (MulF src1 (LoadF src2)));
2957 
2958   format %{ "vmulss  $dst, $src1, $src2" %}
2959   ins_cost(150);
2960   ins_encode %{
2961     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2962   %}
2963   ins_pipe(pipe_slow);
2964 %}
2965 
2966 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2967   predicate(UseAVX > 0);
2968   match(Set dst (MulF src con));
2969 
2970   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2971   ins_cost(150);
2972   ins_encode %{
2973     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2974   %}
2975   ins_pipe(pipe_slow);
2976 %}
2977 
2978 instruct mulD_reg(regD dst, regD src) %{
2979   predicate((UseSSE>=2) && (UseAVX == 0));
2980   match(Set dst (MulD dst src));
2981 
2982   format %{ "mulsd   $dst, $src" %}
2983   ins_cost(150);
2984   ins_encode %{
2985     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2986   %}
2987   ins_pipe(pipe_slow);
2988 %}
2989 
2990 instruct mulD_mem(regD dst, memory src) %{
2991   predicate((UseSSE>=2) && (UseAVX == 0));
2992   match(Set dst (MulD dst (LoadD src)));
2993 
2994   format %{ "mulsd   $dst, $src" %}
2995   ins_cost(150);
2996   ins_encode %{
2997     __ mulsd($dst$$XMMRegister, $src$$Address);
2998   %}
2999   ins_pipe(pipe_slow);
3000 %}
3001 
3002 instruct mulD_imm(regD dst, immD con) %{
3003   predicate((UseSSE>=2) && (UseAVX == 0));
3004   match(Set dst (MulD dst con));
3005   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3006   ins_cost(150);
3007   ins_encode %{
3008     __ mulsd($dst$$XMMRegister, $constantaddress($con));
3009   %}
3010   ins_pipe(pipe_slow);
3011 %}
3012 
3013 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
3014   predicate(UseAVX > 0);
3015   match(Set dst (MulD src1 src2));
3016 
3017   format %{ "vmulsd  $dst, $src1, $src2" %}
3018   ins_cost(150);
3019   ins_encode %{
3020     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3021   %}
3022   ins_pipe(pipe_slow);
3023 %}
3024 
3025 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
3026   predicate(UseAVX > 0);
3027   match(Set dst (MulD src1 (LoadD src2)));
3028 
3029   format %{ "vmulsd  $dst, $src1, $src2" %}
3030   ins_cost(150);
3031   ins_encode %{
3032     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3033   %}
3034   ins_pipe(pipe_slow);
3035 %}
3036 
3037 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
3038   predicate(UseAVX > 0);
3039   match(Set dst (MulD src con));
3040 
3041   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3042   ins_cost(150);
3043   ins_encode %{
3044     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3045   %}
3046   ins_pipe(pipe_slow);
3047 %}
3048 
3049 instruct divF_reg(regF dst, regF src) %{
3050   predicate((UseSSE>=1) && (UseAVX == 0));
3051   match(Set dst (DivF dst src));
3052 
3053   format %{ "divss   $dst, $src" %}
3054   ins_cost(150);
3055   ins_encode %{
3056     __ divss($dst$$XMMRegister, $src$$XMMRegister);
3057   %}
3058   ins_pipe(pipe_slow);
3059 %}
3060 
3061 instruct divF_mem(regF dst, memory src) %{
3062   predicate((UseSSE>=1) && (UseAVX == 0));
3063   match(Set dst (DivF dst (LoadF src)));
3064 
3065   format %{ "divss   $dst, $src" %}
3066   ins_cost(150);
3067   ins_encode %{
3068     __ divss($dst$$XMMRegister, $src$$Address);
3069   %}
3070   ins_pipe(pipe_slow);
3071 %}
3072 
3073 instruct divF_imm(regF dst, immF con) %{
3074   predicate((UseSSE>=1) && (UseAVX == 0));
3075   match(Set dst (DivF dst con));
3076   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3077   ins_cost(150);
3078   ins_encode %{
3079     __ divss($dst$$XMMRegister, $constantaddress($con));
3080   %}
3081   ins_pipe(pipe_slow);
3082 %}
3083 
3084 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
3085   predicate(UseAVX > 0);
3086   match(Set dst (DivF src1 src2));
3087 
3088   format %{ "vdivss  $dst, $src1, $src2" %}
3089   ins_cost(150);
3090   ins_encode %{
3091     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3092   %}
3093   ins_pipe(pipe_slow);
3094 %}
3095 
3096 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
3097   predicate(UseAVX > 0);
3098   match(Set dst (DivF src1 (LoadF src2)));
3099 
3100   format %{ "vdivss  $dst, $src1, $src2" %}
3101   ins_cost(150);
3102   ins_encode %{
3103     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3104   %}
3105   ins_pipe(pipe_slow);
3106 %}
3107 
3108 instruct divF_reg_imm(regF dst, regF src, immF con) %{
3109   predicate(UseAVX > 0);
3110   match(Set dst (DivF src con));
3111 
3112   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3113   ins_cost(150);
3114   ins_encode %{
3115     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3116   %}
3117   ins_pipe(pipe_slow);
3118 %}
3119 
3120 instruct divD_reg(regD dst, regD src) %{
3121   predicate((UseSSE>=2) && (UseAVX == 0));
3122   match(Set dst (DivD dst src));
3123 
3124   format %{ "divsd   $dst, $src" %}
3125   ins_cost(150);
3126   ins_encode %{
3127     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
3128   %}
3129   ins_pipe(pipe_slow);
3130 %}
3131 
3132 instruct divD_mem(regD dst, memory src) %{
3133   predicate((UseSSE>=2) && (UseAVX == 0));
3134   match(Set dst (DivD dst (LoadD src)));
3135 
3136   format %{ "divsd   $dst, $src" %}
3137   ins_cost(150);
3138   ins_encode %{
3139     __ divsd($dst$$XMMRegister, $src$$Address);
3140   %}
3141   ins_pipe(pipe_slow);
3142 %}
3143 
3144 instruct divD_imm(regD dst, immD con) %{
3145   predicate((UseSSE>=2) && (UseAVX == 0));
3146   match(Set dst (DivD dst con));
3147   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3148   ins_cost(150);
3149   ins_encode %{
3150     __ divsd($dst$$XMMRegister, $constantaddress($con));
3151   %}
3152   ins_pipe(pipe_slow);
3153 %}
3154 
3155 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
3156   predicate(UseAVX > 0);
3157   match(Set dst (DivD src1 src2));
3158 
3159   format %{ "vdivsd  $dst, $src1, $src2" %}
3160   ins_cost(150);
3161   ins_encode %{
3162     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3163   %}
3164   ins_pipe(pipe_slow);
3165 %}
3166 
3167 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
3168   predicate(UseAVX > 0);
3169   match(Set dst (DivD src1 (LoadD src2)));
3170 
3171   format %{ "vdivsd  $dst, $src1, $src2" %}
3172   ins_cost(150);
3173   ins_encode %{
3174     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3175   %}
3176   ins_pipe(pipe_slow);
3177 %}
3178 
3179 instruct divD_reg_imm(regD dst, regD src, immD con) %{
3180   predicate(UseAVX > 0);
3181   match(Set dst (DivD src con));
3182 
3183   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3184   ins_cost(150);
3185   ins_encode %{
3186     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3187   %}
3188   ins_pipe(pipe_slow);
3189 %}
3190 
3191 instruct absF_reg(regF dst) %{
3192   predicate((UseSSE>=1) && (UseAVX == 0));
3193   match(Set dst (AbsF dst));
3194   ins_cost(150);
3195   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
3196   ins_encode %{
3197     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
3198   %}
3199   ins_pipe(pipe_slow);
3200 %}
3201 
3202 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
3203   predicate(UseAVX > 0);
3204   match(Set dst (AbsF src));
3205   ins_cost(150);
3206   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
3207   ins_encode %{
3208     int vlen_enc = Assembler::AVX_128bit;
3209     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
3210               ExternalAddress(float_signmask()), vlen_enc);
3211   %}
3212   ins_pipe(pipe_slow);
3213 %}
3214 
3215 instruct absD_reg(regD dst) %{
3216   predicate((UseSSE>=2) && (UseAVX == 0));
3217   match(Set dst (AbsD dst));
3218   ins_cost(150);
3219   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
3220             "# abs double by sign masking" %}
3221   ins_encode %{
3222     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
3223   %}
3224   ins_pipe(pipe_slow);
3225 %}
3226 
3227 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
3228   predicate(UseAVX > 0);
3229   match(Set dst (AbsD src));
3230   ins_cost(150);
3231   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
3232             "# abs double by sign masking" %}
3233   ins_encode %{
3234     int vlen_enc = Assembler::AVX_128bit;
3235     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
3236               ExternalAddress(double_signmask()), vlen_enc);
3237   %}
3238   ins_pipe(pipe_slow);
3239 %}
3240 
3241 instruct negF_reg(regF dst) %{
3242   predicate((UseSSE>=1) && (UseAVX == 0));
3243   match(Set dst (NegF dst));
3244   ins_cost(150);
3245   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
3246   ins_encode %{
3247     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
3248   %}
3249   ins_pipe(pipe_slow);
3250 %}
3251 
3252 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
3253   predicate(UseAVX > 0);
3254   match(Set dst (NegF src));
3255   ins_cost(150);
3256   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
3257   ins_encode %{
3258     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
3259                  ExternalAddress(float_signflip()));
3260   %}
3261   ins_pipe(pipe_slow);
3262 %}
3263 
3264 instruct negD_reg(regD dst) %{
3265   predicate((UseSSE>=2) && (UseAVX == 0));
3266   match(Set dst (NegD dst));
3267   ins_cost(150);
3268   format %{ "xorpd   $dst, [0x8000000000000000]\t"
3269             "# neg double by sign flipping" %}
3270   ins_encode %{
3271     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
3272   %}
3273   ins_pipe(pipe_slow);
3274 %}
3275 
3276 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
3277   predicate(UseAVX > 0);
3278   match(Set dst (NegD src));
3279   ins_cost(150);
3280   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
3281             "# neg double by sign flipping" %}
3282   ins_encode %{
3283     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
3284                  ExternalAddress(double_signflip()));
3285   %}
3286   ins_pipe(pipe_slow);
3287 %}
3288 
3289 // The sqrtss instruction needs its destination register to be pre-initialized for best performance.
3290 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
3291 instruct sqrtF_reg(regF dst) %{
3292   predicate(UseSSE>=1);
3293   match(Set dst (SqrtF dst));
3294   format %{ "sqrtss  $dst, $dst" %}
3295   ins_encode %{
3296     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
3297   %}
3298   ins_pipe(pipe_slow);
3299 %}
3300 
3301 // The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
3302 // Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
3303 instruct sqrtD_reg(regD dst) %{
3304   predicate(UseSSE>=2);
3305   match(Set dst (SqrtD dst));
3306   format %{ "sqrtsd  $dst, $dst" %}
3307   ins_encode %{
3308     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
3309   %}
3310   ins_pipe(pipe_slow);
3311 %}
3312 
3313 // ---------------------------------------- VectorReinterpret ------------------------------------
3314 
3315 instruct reinterpret(vec dst) %{
3316   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
3317   match(Set dst (VectorReinterpret dst));
3318   ins_cost(125);
3319   format %{ "vector_reinterpret $dst\t!" %}
3320   ins_encode %{
3321     // empty
3322   %}
3323   ins_pipe( pipe_slow );
3324 %}
3325 
3326 instruct reinterpret_expand(vec dst, vec src, rRegP scratch) %{
3327   predicate(UseAVX == 0 &&
3328             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3329   match(Set dst (VectorReinterpret src));
3330   ins_cost(125);
3331   effect(TEMP dst, TEMP scratch);
3332   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3333   ins_encode %{
3334     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
3335     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
3336 
3337     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
3338     if (src_vlen_in_bytes == 4) {
3339       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3340     } else {
3341       assert(src_vlen_in_bytes == 8, "");
3342       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3343     }
3344     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3345   %}
3346   ins_pipe( pipe_slow );
3347 %}
3348 
3349 instruct vreinterpret_expand4(legVec dst, vec src, rRegP scratch) %{
3350   predicate(UseAVX > 0 &&
3351             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
3352             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3353   match(Set dst (VectorReinterpret src));
3354   ins_cost(125);
3355   effect(TEMP scratch);
3356   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3357   ins_encode %{
3358     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, $scratch$$Register);
3359   %}
3360   ins_pipe( pipe_slow );
3361 %}
3362 
3363 
3364 instruct vreinterpret_expand(legVec dst, vec src) %{
3365   predicate(UseAVX > 0 &&
3366             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
3367             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3368   match(Set dst (VectorReinterpret src));
3369   ins_cost(125);
3370   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
3371   ins_encode %{
3372     switch (Matcher::vector_length_in_bytes(this, $src)) {
3373       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3374       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3375       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3376       default: ShouldNotReachHere();
3377     }
3378   %}
3379   ins_pipe( pipe_slow );
3380 %}
3381 
3382 instruct reinterpret_shrink(vec dst, legVec src) %{
3383   predicate(Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
3384   match(Set dst (VectorReinterpret src));
3385   ins_cost(125);
3386   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
3387   ins_encode %{
3388     switch (Matcher::vector_length_in_bytes(this)) {
3389       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
3390       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3391       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3392       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3393       default: ShouldNotReachHere();
3394     }
3395   %}
3396   ins_pipe( pipe_slow );
3397 %}
3398 
3399 // ----------------------------------------------------------------------------------------------------
3400 
3401 #ifdef _LP64
3402 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
3403   match(Set dst (RoundDoubleMode src rmode));
3404   format %{ "roundsd $dst,$src" %}
3405   ins_cost(150);
3406   ins_encode %{
3407     assert(UseSSE >= 4, "required");
3408     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
3409   %}
3410   ins_pipe(pipe_slow);
3411 %}
3412 
3413 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
3414   match(Set dst (RoundDoubleMode (LoadD src) rmode));
3415   format %{ "roundsd $dst,$src" %}
3416   ins_cost(150);
3417   ins_encode %{
3418     assert(UseSSE >= 4, "required");
3419     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
3420   %}
3421   ins_pipe(pipe_slow);
3422 %}
3423 
3424 instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
3425   match(Set dst (RoundDoubleMode con rmode));
3426   effect(TEMP scratch_reg);
3427   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
3428   ins_cost(150);
3429   ins_encode %{
3430     assert(UseSSE >= 4, "required");
3431     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
3432   %}
3433   ins_pipe(pipe_slow);
3434 %}
3435 
3436 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
3437   predicate(Matcher::vector_length(n) < 8);
3438   match(Set dst (RoundDoubleModeV src rmode));
3439   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
3440   ins_encode %{
3441     assert(UseAVX > 0, "required");
3442     int vlen_enc = vector_length_encoding(this);
3443     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
3444   %}
3445   ins_pipe( pipe_slow );
3446 %}
3447 
3448 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
3449   predicate(Matcher::vector_length(n) == 8);
3450   match(Set dst (RoundDoubleModeV src rmode));
3451   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
3452   ins_encode %{
3453     assert(UseAVX > 2, "required");
3454     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
3455   %}
3456   ins_pipe( pipe_slow );
3457 %}
3458 
3459 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3460   predicate(Matcher::vector_length(n) < 8);
3461   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3462   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3463   ins_encode %{
3464     assert(UseAVX > 0, "required");
3465     int vlen_enc = vector_length_encoding(this);
3466     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
3467   %}
3468   ins_pipe( pipe_slow );
3469 %}
3470 
3471 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3472   predicate(Matcher::vector_length(n) == 8);
3473   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3474   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3475   ins_encode %{
3476     assert(UseAVX > 2, "required");
3477     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3478   %}
3479   ins_pipe( pipe_slow );
3480 %}
3481 #endif // _LP64
3482 
3483 instruct onspinwait() %{
3484   match(OnSpinWait);
3485   ins_cost(200);
3486 
3487   format %{
3488     $$template
3489     $$emit$$"pause\t! membar_onspinwait"
3490   %}
3491   ins_encode %{
3492     __ pause();
3493   %}
3494   ins_pipe(pipe_slow);
3495 %}
3496 
3497 // a * b + c
3498 instruct fmaD_reg(regD a, regD b, regD c) %{
3499   predicate(UseFMA);
3500   match(Set c (FmaD  c (Binary a b)));
3501   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3502   ins_cost(150);
3503   ins_encode %{
3504     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3505   %}
3506   ins_pipe( pipe_slow );
3507 %}
3508 
3509 // a * b + c
3510 instruct fmaF_reg(regF a, regF b, regF c) %{
3511   predicate(UseFMA);
3512   match(Set c (FmaF  c (Binary a b)));
3513   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3514   ins_cost(150);
3515   ins_encode %{
3516     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3517   %}
3518   ins_pipe( pipe_slow );
3519 %}
3520 
3521 // ====================VECTOR INSTRUCTIONS=====================================
3522 
3523 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
3524 instruct MoveVec2Leg(legVec dst, vec src) %{
3525   match(Set dst src);
3526   format %{ "" %}
3527   ins_encode %{
3528     ShouldNotReachHere();
3529   %}
3530   ins_pipe( fpu_reg_reg );
3531 %}
3532 
3533 instruct MoveLeg2Vec(vec dst, legVec src) %{
3534   match(Set dst src);
3535   format %{ "" %}
3536   ins_encode %{
3537     ShouldNotReachHere();
3538   %}
3539   ins_pipe( fpu_reg_reg );
3540 %}
3541 
3542 // ============================================================================
3543 
3544 // Load vectors generic operand pattern
3545 instruct loadV(vec dst, memory mem) %{
3546   match(Set dst (LoadVector mem));
3547   ins_cost(125);
3548   format %{ "load_vector $dst,$mem" %}
3549   ins_encode %{
3550     switch (Matcher::vector_length_in_bytes(this)) {
3551       case  4: __ movdl    ($dst$$XMMRegister, $mem$$Address); break;
3552       case  8: __ movq     ($dst$$XMMRegister, $mem$$Address); break;
3553       case 16: __ movdqu   ($dst$$XMMRegister, $mem$$Address); break;
3554       case 32: __ vmovdqu  ($dst$$XMMRegister, $mem$$Address); break;
3555       case 64: __ evmovdqul($dst$$XMMRegister, $mem$$Address, Assembler::AVX_512bit); break;
3556       default: ShouldNotReachHere();
3557     }
3558   %}
3559   ins_pipe( pipe_slow );
3560 %}
3561 
3562 // Store vectors generic operand pattern.
3563 instruct storeV(memory mem, vec src) %{
3564   match(Set mem (StoreVector mem src));
3565   ins_cost(145);
3566   format %{ "store_vector $mem,$src" %}
3567   ins_encode %{
3568     switch (Matcher::vector_length_in_bytes(this, $src)) {
3569       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
3570       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
3571       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
3572       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
3573       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
3574       default: ShouldNotReachHere();
3575     }
3576   %}
3577   ins_pipe( pipe_slow );
3578 %}
3579 
3580 // ---------------------------------------- Gather ------------------------------------
3581 
3582 // Gather INT, LONG, FLOAT, DOUBLE
3583 
3584 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
3585   predicate(Matcher::vector_length_in_bytes(n) <= 32);
3586   match(Set dst (LoadVectorGather mem idx));
3587   effect(TEMP dst, TEMP tmp, TEMP mask);
3588   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
3589   ins_encode %{
3590     assert(UseAVX >= 2, "sanity");
3591 
3592     int vlen_enc = vector_length_encoding(this);
3593     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3594 
3595     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
3596     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3597 
3598     if (vlen_enc == Assembler::AVX_128bit) {
3599       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3600     } else {
3601       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3602     }
3603     __ lea($tmp$$Register, $mem$$Address);
3604     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
3605   %}
3606   ins_pipe( pipe_slow );
3607 %}
3608 
3609 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
3610   predicate(Matcher::vector_length_in_bytes(n) == 64);
3611   match(Set dst (LoadVectorGather mem idx));
3612   effect(TEMP dst, TEMP tmp, TEMP ktmp);
3613   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $ktmp as TEMP" %}
3614   ins_encode %{
3615     assert(UseAVX > 2, "sanity");
3616 
3617     int vlen_enc = vector_length_encoding(this);
3618     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3619 
3620     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3621 
3622     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3623     __ lea($tmp$$Register, $mem$$Address);
3624     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3625   %}
3626   ins_pipe( pipe_slow );
3627 %}
3628 
3629 // ====================Scatter=======================================
3630 
3631 // Scatter INT, LONG, FLOAT, DOUBLE
3632 
3633 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
3634   predicate(UseAVX > 2);
3635   match(Set mem (StoreVectorScatter mem (Binary src idx)));
3636   effect(TEMP tmp, TEMP ktmp);
3637   format %{ "store_vector_scatter $mem, $idx, $src\t! using $ktmp and $tmp as TEMP" %}
3638   ins_encode %{
3639     int vlen_enc = vector_length_encoding(this, $src);
3640     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
3641 
3642     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
3643     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3644 
3645     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3646     __ lea($tmp$$Register, $mem$$Address);
3647     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
3648   %}
3649   ins_pipe( pipe_slow );
3650 %}
3651 
3652 // ====================REPLICATE=======================================
3653 
3654 // Replicate byte scalar to be vector
3655 instruct ReplB_reg(vec dst, rRegI src) %{
3656   match(Set dst (ReplicateB src));
3657   format %{ "replicateB $dst,$src" %}
3658   ins_encode %{
3659     uint vlen = Matcher::vector_length(this);
3660     if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
3661       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
3662       int vlen_enc = vector_length_encoding(this);
3663       __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
3664     } else if (VM_Version::supports_avx2()) {
3665       int vlen_enc = vector_length_encoding(this);
3666       __ movdl($dst$$XMMRegister, $src$$Register);
3667       __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3668     } else {
3669       __ movdl($dst$$XMMRegister, $src$$Register);
3670       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3671       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3672       if (vlen >= 16) {
3673         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3674         if (vlen >= 32) {
3675           assert(vlen == 32, "sanity");
3676           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3677         }
3678       }
3679     }
3680   %}
3681   ins_pipe( pipe_slow );
3682 %}
3683 
3684 instruct ReplB_mem(vec dst, memory mem) %{
3685   predicate(VM_Version::supports_avx2());
3686   match(Set dst (ReplicateB (LoadB mem)));
3687   format %{ "replicateB $dst,$mem" %}
3688   ins_encode %{
3689     int vlen_enc = vector_length_encoding(this);
3690     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
3691   %}
3692   ins_pipe( pipe_slow );
3693 %}
3694 
3695 instruct ReplB_imm(vec dst, immI con) %{
3696   match(Set dst (ReplicateB con));
3697   format %{ "replicateB $dst,$con" %}
3698   ins_encode %{
3699     uint vlen = Matcher::vector_length(this);
3700     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 1));
3701     if (vlen == 4) {
3702       __ movdl($dst$$XMMRegister, const_addr);
3703     } else {
3704       __ movq($dst$$XMMRegister, const_addr);
3705       if (vlen >= 16) {
3706         if (VM_Version::supports_avx2()) {
3707           int vlen_enc = vector_length_encoding(this);
3708           __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3709         } else {
3710           assert(vlen == 16, "sanity");
3711           __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3712         }
3713       }
3714     }
3715   %}
3716   ins_pipe( pipe_slow );
3717 %}
3718 
3719 // Replicate byte scalar zero to be vector
3720 instruct ReplB_zero(vec dst, immI_0 zero) %{
3721   match(Set dst (ReplicateB zero));
3722   format %{ "replicateB $dst,$zero" %}
3723   ins_encode %{
3724     uint vlen = Matcher::vector_length(this);
3725     if (vlen <= 16) {
3726       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3727     } else {
3728       // Use vpxor since AVX512F does not have 512bit vxorpd (requires AVX512DQ).
3729       int vlen_enc = vector_length_encoding(this);
3730       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3731     }
3732   %}
3733   ins_pipe( fpu_reg_reg );
3734 %}
3735 
3736 // ====================ReplicateS=======================================
3737 
3738 instruct ReplS_reg(vec dst, rRegI src) %{
3739   match(Set dst (ReplicateS src));
3740   format %{ "replicateS $dst,$src" %}
3741   ins_encode %{
3742     uint vlen = Matcher::vector_length(this);
3743     if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
3744       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
3745       int vlen_enc = vector_length_encoding(this);
3746       __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
3747     } else if (VM_Version::supports_avx2()) {
3748       int vlen_enc = vector_length_encoding(this);
3749       __ movdl($dst$$XMMRegister, $src$$Register);
3750       __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3751     } else {
3752       __ movdl($dst$$XMMRegister, $src$$Register);
3753       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3754       if (vlen >= 8) {
3755         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3756         if (vlen >= 16) {
3757           assert(vlen == 16, "sanity");
3758           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3759         }
3760       }
3761     }
3762   %}
3763   ins_pipe( pipe_slow );
3764 %}
3765 
3766 instruct ReplS_mem(vec dst, memory mem) %{
3767   predicate(VM_Version::supports_avx2());
3768   match(Set dst (ReplicateS (LoadS mem)));
3769   format %{ "replicateS $dst,$mem" %}
3770   ins_encode %{
3771     int vlen_enc = vector_length_encoding(this);
3772     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
3773   %}
3774   ins_pipe( pipe_slow );
3775 %}
3776 
3777 instruct ReplS_imm(vec dst, immI con) %{
3778   match(Set dst (ReplicateS con));
3779   format %{ "replicateS $dst,$con" %}
3780   ins_encode %{
3781     uint vlen = Matcher::vector_length(this);
3782     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 2));
3783     if (vlen == 2) {
3784       __ movdl($dst$$XMMRegister, const_addr);
3785     } else {
3786       __ movq($dst$$XMMRegister, const_addr);
3787       if (vlen >= 8) {
3788         if (VM_Version::supports_avx2()) {
3789           int vlen_enc = vector_length_encoding(this);
3790           __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3791         } else {
3792           assert(vlen == 8, "sanity");
3793           __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3794         }
3795       }
3796     }
3797   %}
3798   ins_pipe( fpu_reg_reg );
3799 %}
3800 
3801 instruct ReplS_zero(vec dst, immI_0 zero) %{
3802   match(Set dst (ReplicateS zero));
3803   format %{ "replicateS $dst,$zero" %}
3804   ins_encode %{
3805     uint vlen = Matcher::vector_length(this);
3806     if (vlen <= 8) {
3807       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3808     } else {
3809       int vlen_enc = vector_length_encoding(this);
3810       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3811     }
3812   %}
3813   ins_pipe( fpu_reg_reg );
3814 %}
3815 
3816 // ====================ReplicateI=======================================
3817 
3818 instruct ReplI_reg(vec dst, rRegI src) %{
3819   match(Set dst (ReplicateI src));
3820   format %{ "replicateI $dst,$src" %}
3821   ins_encode %{
3822     uint vlen = Matcher::vector_length(this);
3823     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3824       int vlen_enc = vector_length_encoding(this);
3825       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
3826     } else if (VM_Version::supports_avx2()) {
3827       int vlen_enc = vector_length_encoding(this);
3828       __ movdl($dst$$XMMRegister, $src$$Register);
3829       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3830     } else {
3831       __ movdl($dst$$XMMRegister, $src$$Register);
3832       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3833       if (vlen >= 8) {
3834         assert(vlen == 8, "sanity");
3835         __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3836       }
3837     }
3838   %}
3839   ins_pipe( pipe_slow );
3840 %}
3841 
3842 instruct ReplI_mem(vec dst, memory mem) %{
3843   match(Set dst (ReplicateI (LoadI mem)));
3844   format %{ "replicateI $dst,$mem" %}
3845   ins_encode %{
3846     uint vlen = Matcher::vector_length(this);
3847     if (vlen <= 4) {
3848       __ movdl($dst$$XMMRegister, $mem$$Address);
3849       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3850     } else {
3851       assert(VM_Version::supports_avx2(), "sanity");
3852       int vlen_enc = vector_length_encoding(this);
3853       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
3854     }
3855   %}
3856   ins_pipe( pipe_slow );
3857 %}
3858 
3859 instruct ReplI_imm(vec dst, immI con) %{
3860   match(Set dst (ReplicateI con));
3861   format %{ "replicateI $dst,$con" %}
3862   ins_encode %{
3863     uint vlen = Matcher::vector_length(this);
3864     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 4));
3865     if (vlen <= 4) {
3866       __ movq($dst$$XMMRegister, const_addr);
3867       if (vlen == 4) {
3868         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3869       }
3870     } else {
3871       assert(VM_Version::supports_avx2(), "sanity");
3872       int vlen_enc = vector_length_encoding(this);
3873       __ movq($dst$$XMMRegister, const_addr);
3874       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3875     }
3876   %}
3877   ins_pipe( pipe_slow );
3878 %}
3879 
3880 // Replicate integer (4 byte) scalar zero to be vector
3881 instruct ReplI_zero(vec dst, immI_0 zero) %{
3882   match(Set dst (ReplicateI zero));
3883   format %{ "replicateI $dst,$zero" %}
3884   ins_encode %{
3885     uint vlen = Matcher::vector_length(this);
3886     if (vlen <= 4) {
3887       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3888     } else {
3889       int vlen_enc = vector_length_encoding(this);
3890       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3891     }
3892   %}
3893   ins_pipe( fpu_reg_reg );
3894 %}
3895 
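// Replicate scalar -1 (all bits set) to be vector. A single all-ones bit pattern serves
// byte, short and int elements, so one vallones rule matches all three replicate nodes.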
3896 instruct ReplI_M1(vec dst, immI_M1 con) %{
3897   predicate(UseAVX > 0);
3898   match(Set dst (ReplicateB con));
3899   match(Set dst (ReplicateS con));
3900   match(Set dst (ReplicateI con));
3901   effect(TEMP dst);
3902   format %{ "vallones $dst" %}
3903   ins_encode %{
3904     int vector_len = vector_length_encoding(this);
3905     __ vallones($dst$$XMMRegister, vector_len);
3906   %}
3907   ins_pipe( pipe_slow );
3908 %}
3909 
3910 // ====================ReplicateL=======================================
3911 
3912 #ifdef _LP64
3913 // Replicate long (8 byte) scalar to be vector
3914 instruct ReplL_reg(vec dst, rRegL src) %{
3915   match(Set dst (ReplicateL src));
3916   format %{ "replicateL $dst,$src" %}
3917   ins_encode %{
3918     uint vlen = Matcher::vector_length(this);
3919     if (vlen == 2) {
3920       __ movdq($dst$$XMMRegister, $src$$Register);
3921       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3922     } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3923       int vlen_enc = vector_length_encoding(this);
3924       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
3925     } else if (VM_Version::supports_avx2()) {
3926       assert(vlen == 4, "sanity");
3927       int vlen_enc = vector_length_encoding(this);
3928       __ movdq($dst$$XMMRegister, $src$$Register);
3929       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3930     } else {
3931       assert(vlen == 4, "sanity");
3932       __ movdq($dst$$XMMRegister, $src$$Register);
3933       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3934       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3935     }
3936   %}
3937   ins_pipe( pipe_slow );
3938 %}
3939 #else // _LP64
3940 // Replicate long (8 byte) scalar to be vector
3941 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
3942   predicate(Matcher::vector_length(n) <= 4);
3943   match(Set dst (ReplicateL src));
3944   effect(TEMP dst, USE src, TEMP tmp);
3945   format %{ "replicateL $dst,$src" %}
3946   ins_encode %{
3947     uint vlen = Matcher::vector_length(this);
3948     if (vlen == 2) {
3949       __ movdl($dst$$XMMRegister, $src$$Register);
3950       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3951       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3952       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3953     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3954       int vlen_enc = Assembler::AVX_256bit;
3955       __ movdl($dst$$XMMRegister, $src$$Register);
3956       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3957       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3958       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3959     } else {
3960       __ movdl($dst$$XMMRegister, $src$$Register);
3961       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3962       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3963       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3964       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3965     }
3966   %}
3967   ins_pipe( pipe_slow );
3968 %}
3969 
3970 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
3971   predicate(Matcher::vector_length(n) == 8);
3972   match(Set dst (ReplicateL src));
3973   effect(TEMP dst, USE src, TEMP tmp);
3974   format %{ "replicateL $dst,$src" %}
3975   ins_encode %{
3976     if (VM_Version::supports_avx512vl()) {
3977       __ movdl($dst$$XMMRegister, $src$$Register);
3978       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3979       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3980       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3981       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3982       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3983     } else {
3984       int vlen_enc = Assembler::AVX_512bit;
3985       __ movdl($dst$$XMMRegister, $src$$Register);
3986       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3987       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3988       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3989     }
3990   %}
3991   ins_pipe( pipe_slow );
3992 %}
3993 #endif // _LP64
3994 
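// Replicate long (8 byte) scalar loaded from memory to be vector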
3995 instruct ReplL_mem(vec dst, memory mem) %{
3996   match(Set dst (ReplicateL (LoadL mem)));
3997   format %{ "replicateL $dst,$mem" %}
3998   ins_encode %{
3999     uint vlen = Matcher::vector_length(this);
4000     if (vlen == 2) {
4001       __ movq($dst$$XMMRegister, $mem$$Address);
4002       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4003     } else {
4004       assert(VM_Version::supports_avx2(), "sanity");
4005       int vlen_enc = vector_length_encoding(this);
4006       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
4007     }
4008   %}
4009   ins_pipe( pipe_slow );
4010 %}
4011 
4012 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4013 instruct ReplL_imm(vec dst, immL con) %{
4014   match(Set dst (ReplicateL con));
4015   format %{ "replicateL $dst,$con" %}
4016   ins_encode %{
4017     uint vlen = Matcher::vector_length(this);
4018     InternalAddress const_addr = $constantaddress($con);
4019     if (vlen == 2) {
4020       __ movq($dst$$XMMRegister, const_addr);
4021       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4022     } else {
4023       assert(VM_Version::supports_avx2(), "sanity");
4024       int vlen_enc = vector_length_encoding(this);
4025       __ movq($dst$$XMMRegister, const_addr);
4026       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4027     }
4028   %}
4029   ins_pipe( pipe_slow );
4030 %}
4031 
4032 instruct ReplL_zero(vec dst, immL0 zero) %{
4033   match(Set dst (ReplicateL zero));
4034   format %{ "replicateL $dst,$zero" %}
4035   ins_encode %{
    uint vlen = Matcher::vector_length(this);
4037     if (vlen == 2) {
4038       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4039     } else {
4040       int vlen_enc = vector_length_encoding(this);
4041       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4042     }
4043   %}
4044   ins_pipe( fpu_reg_reg );
4045 %}
4046 
4047 instruct ReplL_M1(vec dst, immL_M1 con) %{
4048   predicate(UseAVX > 0);
4049   match(Set dst (ReplicateL con));
4050   effect(TEMP dst);
4051   format %{ "vallones $dst" %}
4052   ins_encode %{
4053     int vector_len = vector_length_encoding(this);
4054     __ vallones($dst$$XMMRegister, vector_len);
4055   %}
4056   ins_pipe( pipe_slow );
4057 %}
4058 
4059 // ====================ReplicateF=======================================
4060 
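// Replicate float (4 byte) scalar to be vector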
4061 instruct ReplF_reg(vec dst, vlRegF src) %{
4062   match(Set dst (ReplicateF src));
4063   format %{ "replicateF $dst,$src" %}
4064   ins_encode %{
4065     uint vlen = Matcher::vector_length(this);
4066     if (vlen <= 4) {
4067       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
    } else if (VM_Version::supports_avx2()) {
4069       int vlen_enc = vector_length_encoding(this);
4070       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4071     } else {
4072       assert(vlen == 8, "sanity");
4073       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4074       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4075     }
4076   %}
4077   ins_pipe( pipe_slow );
4078 %}
4079 
4080 instruct ReplF_mem(vec dst, memory mem) %{
4081   match(Set dst (ReplicateF (LoadF mem)));
4082   format %{ "replicateF $dst,$mem" %}
4083   ins_encode %{
4084     uint vlen = Matcher::vector_length(this);
4085     if (vlen <= 4) {
4086       __ movdl($dst$$XMMRegister, $mem$$Address);
4087       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4088     } else {
4089       assert(VM_Version::supports_avx(), "sanity");
4090       int vlen_enc = vector_length_encoding(this);
4091       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
4092     }
4093   %}
4094   ins_pipe( pipe_slow );
4095 %}
4096 
4097 instruct ReplF_zero(vec dst, immF0 zero) %{
4098   match(Set dst (ReplicateF zero));
4099   format %{ "replicateF $dst,$zero" %}
4100   ins_encode %{
4101     uint vlen = Matcher::vector_length(this);
4102     if (vlen <= 4) {
4103       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4104     } else {
4105       int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4107     }
4108   %}
4109   ins_pipe( fpu_reg_reg );
4110 %}
4111 
4112 // ====================ReplicateD=======================================
4113 
4114 // Replicate double (8 bytes) scalar to be vector
4115 instruct ReplD_reg(vec dst, vlRegD src) %{
4116   match(Set dst (ReplicateD src));
4117   format %{ "replicateD $dst,$src" %}
4118   ins_encode %{
4119     uint vlen = Matcher::vector_length(this);
4120     if (vlen == 2) {
4121       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4122     } else if (VM_Version::supports_avx2()) {
4123       int vlen_enc = vector_length_encoding(this);
4124       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4125     } else {
4126       assert(vlen == 4, "sanity");
4127       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4128       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4129     }
4130   %}
4131   ins_pipe( pipe_slow );
4132 %}
4133 
4134 instruct ReplD_mem(vec dst, memory mem) %{
4135   match(Set dst (ReplicateD (LoadD mem)));
4136   format %{ "replicateD $dst,$mem" %}
4137   ins_encode %{
4138     uint vlen = Matcher::vector_length(this);
4139     if (vlen == 2) {
4140       __ movq($dst$$XMMRegister, $mem$$Address);
4141       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
4142     } else {
4143       assert(VM_Version::supports_avx(), "sanity");
4144       int vlen_enc = vector_length_encoding(this);
4145       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4146     }
4147   %}
4148   ins_pipe( pipe_slow );
4149 %}
4150 
4151 instruct ReplD_zero(vec dst, immD0 zero) %{
4152   match(Set dst (ReplicateD zero));
4153   format %{ "replicateD $dst,$zero" %}
4154   ins_encode %{
4155     uint vlen = Matcher::vector_length(this);
4156     if (vlen == 2) {
4157       __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4158     } else {
4159       int vlen_enc = vector_length_encoding(this);
      __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorpd requires AVX512DQ
4161     }
4162   %}
4163   ins_pipe( fpu_reg_reg );
4164 %}
4165 
4166 // ====================VECTOR INSERT=======================================
4167 
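// Scalar insertion into a vector. For vectors wider than 128 bits the element index is
// split into a position within a 128-bit lane (x_idx) and a lane number (y_idx): the lane
// holding the element is extracted into a temporary, the scalar is inserted there, and the
// updated lane is written back into the destination.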
4168 instruct insert(vec dst, rRegI val, immU8 idx) %{
4169   predicate(Matcher::vector_length_in_bytes(n) < 32);
4170   match(Set dst (VectorInsert (Binary dst val) idx));
4171   format %{ "vector_insert $dst,$val,$idx" %}
4172   ins_encode %{
4173     assert(UseSSE >= 4, "required");
4174     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
4175 
4176     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4177 
4178     assert(is_integral_type(elem_bt), "");
4179     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4180 
4181     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4182   %}
4183   ins_pipe( pipe_slow );
4184 %}
4185 
4186 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4187   predicate(Matcher::vector_length_in_bytes(n) == 32);
4188   match(Set dst (VectorInsert (Binary src val) idx));
4189   effect(TEMP vtmp);
4190   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4191   ins_encode %{
4192     int vlen_enc = Assembler::AVX_256bit;
4193     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4194     int elem_per_lane = 16/type2aelembytes(elem_bt);
4195     int log2epr = log2(elem_per_lane);
4196 
4197     assert(is_integral_type(elem_bt), "sanity");
4198     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4199 
4200     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4201     uint y_idx = ($idx$$constant >> log2epr) & 1;
4202     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4203     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4204     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4205   %}
4206   ins_pipe( pipe_slow );
4207 %}
4208 
4209 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4210   predicate(Matcher::vector_length_in_bytes(n) == 64);
4211   match(Set dst (VectorInsert (Binary src val) idx));
4212   effect(TEMP vtmp);
4213   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4214   ins_encode %{
4215     assert(UseAVX > 2, "sanity");
4216 
4217     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4218     int elem_per_lane = 16/type2aelembytes(elem_bt);
4219     int log2epr = log2(elem_per_lane);
4220 
4221     assert(is_integral_type(elem_bt), "");
4222     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4223 
4224     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4225     uint y_idx = ($idx$$constant >> log2epr) & 3;
4226     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4227     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4228     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4229   %}
4230   ins_pipe( pipe_slow );
4231 %}
4232 
4233 #ifdef _LP64
4234 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4235   predicate(Matcher::vector_length(n) == 2);
4236   match(Set dst (VectorInsert (Binary dst val) idx));
4237   format %{ "vector_insert $dst,$val,$idx" %}
4238   ins_encode %{
4239     assert(UseSSE >= 4, "required");
4240     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4241     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4242 
4243     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4244   %}
4245   ins_pipe( pipe_slow );
4246 %}
4247 
4248 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4249   predicate(Matcher::vector_length(n) == 4);
4250   match(Set dst (VectorInsert (Binary src val) idx));
4251   effect(TEMP vtmp);
4252   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4253   ins_encode %{
4254     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4255     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4256 
4257     uint x_idx = $idx$$constant & right_n_bits(1);
4258     uint y_idx = ($idx$$constant >> 1) & 1;
4259     int vlen_enc = Assembler::AVX_256bit;
4260     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4261     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4262     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4263   %}
4264   ins_pipe( pipe_slow );
4265 %}
4266 
4267 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4268   predicate(Matcher::vector_length(n) == 8);
4269   match(Set dst (VectorInsert (Binary src val) idx));
4270   effect(TEMP vtmp);
4271   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4272   ins_encode %{
4273     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
4274     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4275 
4276     uint x_idx = $idx$$constant & right_n_bits(1);
4277     uint y_idx = ($idx$$constant >> 1) & 3;
4278     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4279     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4280     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4281   %}
4282   ins_pipe( pipe_slow );
4283 %}
4284 #endif
4285 
4286 instruct insertF(vec dst, regF val, immU8 idx) %{
4287   predicate(Matcher::vector_length(n) < 8);
4288   match(Set dst (VectorInsert (Binary dst val) idx));
4289   format %{ "vector_insert $dst,$val,$idx" %}
4290   ins_encode %{
4291     assert(UseSSE >= 4, "sanity");
4292 
4293     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4294     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4295 
4296     __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
4297   %}
4298   ins_pipe( pipe_slow );
4299 %}
4300 
4301 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4302   predicate(Matcher::vector_length(n) >= 8);
4303   match(Set dst (VectorInsert (Binary src val) idx));
4304   effect(TEMP vtmp);
4305   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4306   ins_encode %{
4307     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4308     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4309 
4310     int vlen = Matcher::vector_length(this);
4311     uint x_idx = $idx$$constant & right_n_bits(2);
4312     if (vlen == 8) {
4313       uint y_idx = ($idx$$constant >> 2) & 1;
4314       int vlen_enc = Assembler::AVX_256bit;
4315       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4316       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4317       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4318     } else {
4319       assert(vlen == 16, "sanity");
4320       uint y_idx = ($idx$$constant >> 2) & 3;
4321       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4322       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4323       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4324     }
4325   %}
4326   ins_pipe( pipe_slow );
4327 %}
4328 
4329 #ifdef _LP64
4330 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4331   predicate(Matcher::vector_length(n) == 2);
4332   match(Set dst (VectorInsert (Binary dst val) idx));
4333   effect(TEMP tmp);
4334   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4335   ins_encode %{
4336     assert(UseSSE >= 4, "sanity");
4337     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4338     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4339 
4340     __ movq($tmp$$Register, $val$$XMMRegister);
4341     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
4342   %}
4343   ins_pipe( pipe_slow );
4344 %}
4345 
4346 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
4347   predicate(Matcher::vector_length(n) == 4);
4348   match(Set dst (VectorInsert (Binary src val) idx));
4349   effect(TEMP vtmp, TEMP tmp);
4350   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4351   ins_encode %{
4352     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4353     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4354 
4355     uint x_idx = $idx$$constant & right_n_bits(1);
4356     uint y_idx = ($idx$$constant >> 1) & 1;
4357     int vlen_enc = Assembler::AVX_256bit;
4358     __ movq($tmp$$Register, $val$$XMMRegister);
4359     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4360     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4361     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4362   %}
4363   ins_pipe( pipe_slow );
4364 %}
4365 
instruct insert8D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, legVec vtmp) %{
4367   predicate(Matcher::vector_length(n) == 8);
4368   match(Set dst (VectorInsert (Binary src val) idx));
4369   effect(TEMP tmp, TEMP vtmp);
  format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4371   ins_encode %{
4372     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4373     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4374 
4375     uint x_idx = $idx$$constant & right_n_bits(1);
4376     uint y_idx = ($idx$$constant >> 1) & 3;
4377     __ movq($tmp$$Register, $val$$XMMRegister);
4378     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4379     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4380     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4381   %}
4382   ins_pipe( pipe_slow );
4383 %}
4384 #endif
4385 
4386 // ====================REDUCTION ARITHMETIC=======================================
4387 
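// Reductions fold a vector operand horizontally into a scalar. For integral element types
// the scalar input src1 is combined with the folded result of src2; for float/double add
// and mul reductions the accumulator is dst itself (matched as both input and output).
// The TEMP vector registers hold intermediate values during the fold.
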
4388 // =======================Int Reduction==========================================
4389 
4390 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4391   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
4392   match(Set dst (AddReductionVI src1 src2));
4393   match(Set dst (MulReductionVI src1 src2));
4394   match(Set dst (AndReductionV  src1 src2));
4395   match(Set dst ( OrReductionV  src1 src2));
4396   match(Set dst (XorReductionV  src1 src2));
4397   match(Set dst (MinReductionV  src1 src2));
4398   match(Set dst (MaxReductionV  src1 src2));
4399   effect(TEMP vtmp1, TEMP vtmp2);
4400   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4401   ins_encode %{
4402     int opcode = this->ideal_Opcode();
4403     int vlen = Matcher::vector_length(this, $src2);
4404     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4405   %}
4406   ins_pipe( pipe_slow );
4407 %}
4408 
4409 // =======================Long Reduction==========================================
4410 
4411 #ifdef _LP64
4412 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4413   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
4414   match(Set dst (AddReductionVL src1 src2));
4415   match(Set dst (MulReductionVL src1 src2));
4416   match(Set dst (AndReductionV  src1 src2));
4417   match(Set dst ( OrReductionV  src1 src2));
4418   match(Set dst (XorReductionV  src1 src2));
4419   match(Set dst (MinReductionV  src1 src2));
4420   match(Set dst (MaxReductionV  src1 src2));
4421   effect(TEMP vtmp1, TEMP vtmp2);
4422   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4423   ins_encode %{
4424     int opcode = this->ideal_Opcode();
4425     int vlen = Matcher::vector_length(this, $src2);
4426     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4427   %}
4428   ins_pipe( pipe_slow );
4429 %}
4430 
4431 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
4432   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
4433   match(Set dst (AddReductionVL src1 src2));
4434   match(Set dst (MulReductionVL src1 src2));
4435   match(Set dst (AndReductionV  src1 src2));
4436   match(Set dst ( OrReductionV  src1 src2));
4437   match(Set dst (XorReductionV  src1 src2));
4438   match(Set dst (MinReductionV  src1 src2));
4439   match(Set dst (MaxReductionV  src1 src2));
4440   effect(TEMP vtmp1, TEMP vtmp2);
4441   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4442   ins_encode %{
4443     int opcode = this->ideal_Opcode();
4444     int vlen = Matcher::vector_length(this, $src2);
4445     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4446   %}
4447   ins_pipe( pipe_slow );
4448 %}
4449 #endif // _LP64
4450 
4451 // =======================Float Reduction==========================================
4452 
4453 instruct reductionF128(regF dst, vec src, vec vtmp) %{
4454   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
4455   match(Set dst (AddReductionVF dst src));
4456   match(Set dst (MulReductionVF dst src));
4457   effect(TEMP dst, TEMP vtmp);
4458   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
4459   ins_encode %{
4460     int opcode = this->ideal_Opcode();
4461     int vlen = Matcher::vector_length(this, $src);
4462     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4463   %}
4464   ins_pipe( pipe_slow );
4465 %}
4466 
4467 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
4468   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4469   match(Set dst (AddReductionVF dst src));
4470   match(Set dst (MulReductionVF dst src));
4471   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4472   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4473   ins_encode %{
4474     int opcode = this->ideal_Opcode();
4475     int vlen = Matcher::vector_length(this, $src);
4476     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4477   %}
4478   ins_pipe( pipe_slow );
4479 %}
4480 
4481 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4482   predicate(Matcher::vector_length(n->in(2)) == 16); // src
4483   match(Set dst (AddReductionVF dst src));
4484   match(Set dst (MulReductionVF dst src));
4485   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4486   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4487   ins_encode %{
4488     int opcode = this->ideal_Opcode();
4489     int vlen = Matcher::vector_length(this, $src);
4490     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4491   %}
4492   ins_pipe( pipe_slow );
4493 %}
4494 
4495 // =======================Double Reduction==========================================
4496 
4497 instruct reduction2D(regD dst, vec src, vec vtmp) %{
4498   predicate(Matcher::vector_length(n->in(2)) == 2); // src
4499   match(Set dst (AddReductionVD dst src));
4500   match(Set dst (MulReductionVD dst src));
4501   effect(TEMP dst, TEMP vtmp);
4502   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
4503   ins_encode %{
4504     int opcode = this->ideal_Opcode();
4505     int vlen = Matcher::vector_length(this, $src);
4506     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
  %}
4508   ins_pipe( pipe_slow );
4509 %}
4510 
4511 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
4512   predicate(Matcher::vector_length(n->in(2)) == 4); // src
4513   match(Set dst (AddReductionVD dst src));
4514   match(Set dst (MulReductionVD dst src));
4515   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4516   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4517   ins_encode %{
4518     int opcode = this->ideal_Opcode();
4519     int vlen = Matcher::vector_length(this, $src);
4520     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4521   %}
4522   ins_pipe( pipe_slow );
4523 %}
4524 
4525 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4526   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4527   match(Set dst (AddReductionVD dst src));
4528   match(Set dst (MulReductionVD dst src));
4529   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4530   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4531   ins_encode %{
4532     int opcode = this->ideal_Opcode();
4533     int vlen = Matcher::vector_length(this, $src);
4534     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4535   %}
4536   ins_pipe( pipe_slow );
4537 %}
4538 
4539 // =======================Byte Reduction==========================================
4540 
4541 #ifdef _LP64
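// Byte reductions: with AVX512BW the unrestricted vec operand class is used, otherwise
// the rules fall back to the legVec class.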
4542 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4543   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
4544   match(Set dst (AddReductionVI src1 src2));
4545   match(Set dst (AndReductionV  src1 src2));
4546   match(Set dst ( OrReductionV  src1 src2));
4547   match(Set dst (XorReductionV  src1 src2));
4548   match(Set dst (MinReductionV  src1 src2));
4549   match(Set dst (MaxReductionV  src1 src2));
4550   effect(TEMP vtmp1, TEMP vtmp2);
4551   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4552   ins_encode %{
4553     int opcode = this->ideal_Opcode();
4554     int vlen = Matcher::vector_length(this, $src2);
4555     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4556   %}
4557   ins_pipe( pipe_slow );
4558 %}
4559 
4560 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4561   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
4562   match(Set dst (AddReductionVI src1 src2));
4563   match(Set dst (AndReductionV  src1 src2));
4564   match(Set dst ( OrReductionV  src1 src2));
4565   match(Set dst (XorReductionV  src1 src2));
4566   match(Set dst (MinReductionV  src1 src2));
4567   match(Set dst (MaxReductionV  src1 src2));
4568   effect(TEMP vtmp1, TEMP vtmp2);
4569   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4570   ins_encode %{
4571     int opcode = this->ideal_Opcode();
4572     int vlen = Matcher::vector_length(this, $src2);
4573     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4574   %}
4575   ins_pipe( pipe_slow );
4576 %}
4577 #endif
4578 
4579 // =======================Short Reduction==========================================
4580 
4581 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4582   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
4583   match(Set dst (AddReductionVI src1 src2));
4584   match(Set dst (MulReductionVI src1 src2));
4585   match(Set dst (AndReductionV  src1 src2));
4586   match(Set dst ( OrReductionV  src1 src2));
4587   match(Set dst (XorReductionV  src1 src2));
4588   match(Set dst (MinReductionV  src1 src2));
4589   match(Set dst (MaxReductionV  src1 src2));
4590   effect(TEMP vtmp1, TEMP vtmp2);
4591   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4592   ins_encode %{
4593     int opcode = this->ideal_Opcode();
4594     int vlen = Matcher::vector_length(this, $src2);
4595     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4596   %}
4597   ins_pipe( pipe_slow );
4598 %}
4599 
4600 // =======================Mul Reduction==========================================
4601 
4602 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4603   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4604             Matcher::vector_length(n->in(2)) <= 32); // src2
4605   match(Set dst (MulReductionVI src1 src2));
4606   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4607   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4608   ins_encode %{
4609     int opcode = this->ideal_Opcode();
4610     int vlen = Matcher::vector_length(this, $src2);
4611     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4612   %}
4613   ins_pipe( pipe_slow );
4614 %}
4615 
4616 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4617   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4618             Matcher::vector_length(n->in(2)) == 64); // src2
4619   match(Set dst (MulReductionVI src1 src2));
4620   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4621   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4622   ins_encode %{
4623     int opcode = this->ideal_Opcode();
4624     int vlen = Matcher::vector_length(this, $src2);
4625     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4626   %}
4627   ins_pipe( pipe_slow );
4628 %}
4629 
4630 //--------------------Min/Max Float Reduction --------------------
// Float Min/Max Reduction
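// The immF src1 forms require src1 to be the identity value of the operation (+Inf for
// min, -Inf for max), enforced by the predicate, so only src2 needs to be reduced; the
// _av forms below accumulate into dst instead.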
4632 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
4633                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4634   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4635             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4636              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4637             Matcher::vector_length(n->in(2)) == 2);
4638   match(Set dst (MinReductionV src1 src2));
4639   match(Set dst (MaxReductionV src1 src2));
4640   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4641   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4642   ins_encode %{
4643     assert(UseAVX > 0, "sanity");
4644 
4645     int opcode = this->ideal_Opcode();
4646     int vlen = Matcher::vector_length(this, $src2);
4647     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4648                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4649   %}
4650   ins_pipe( pipe_slow );
4651 %}
4652 
4653 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
4654                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4655   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4656             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4657              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4658             Matcher::vector_length(n->in(2)) >= 4);
4659   match(Set dst (MinReductionV src1 src2));
4660   match(Set dst (MaxReductionV src1 src2));
4661   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4662   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4663   ins_encode %{
4664     assert(UseAVX > 0, "sanity");
4665 
4666     int opcode = this->ideal_Opcode();
4667     int vlen = Matcher::vector_length(this, $src2);
4668     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4669                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4670   %}
4671   ins_pipe( pipe_slow );
4672 %}
4673 
4674 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
4675                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4676   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4677             Matcher::vector_length(n->in(2)) == 2);
4678   match(Set dst (MinReductionV dst src));
4679   match(Set dst (MaxReductionV dst src));
4680   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4681   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4682   ins_encode %{
4683     assert(UseAVX > 0, "sanity");
4684 
4685     int opcode = this->ideal_Opcode();
4686     int vlen = Matcher::vector_length(this, $src);
4687     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4688                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4689   %}
4690   ins_pipe( pipe_slow );
4691 %}
4692 
4694 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
4695                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4696   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4697             Matcher::vector_length(n->in(2)) >= 4);
4698   match(Set dst (MinReductionV dst src));
4699   match(Set dst (MaxReductionV dst src));
4700   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4701   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4702   ins_encode %{
4703     assert(UseAVX > 0, "sanity");
4704 
4705     int opcode = this->ideal_Opcode();
4706     int vlen = Matcher::vector_length(this, $src);
4707     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4708                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4709   %}
4710   ins_pipe( pipe_slow );
4711 %}
4712 
//--------------------Min/Max Double Reduction --------------------
4715 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
4716                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4717                             rFlagsReg cr) %{
4718   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4719             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4720              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4721             Matcher::vector_length(n->in(2)) == 2);
4722   match(Set dst (MinReductionV src1 src2));
4723   match(Set dst (MaxReductionV src1 src2));
4724   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4725   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4726   ins_encode %{
4727     assert(UseAVX > 0, "sanity");
4728 
4729     int opcode = this->ideal_Opcode();
4730     int vlen = Matcher::vector_length(this, $src2);
4731     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
4732                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
4733   %}
4734   ins_pipe( pipe_slow );
4735 %}
4736 
4737 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
4738                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
4739                            rFlagsReg cr) %{
4740   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4741             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4742              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4743             Matcher::vector_length(n->in(2)) >= 4);
4744   match(Set dst (MinReductionV src1 src2));
4745   match(Set dst (MaxReductionV src1 src2));
4746   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
4747   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
4748   ins_encode %{
4749     assert(UseAVX > 0, "sanity");
4750 
4751     int opcode = this->ideal_Opcode();
4752     int vlen = Matcher::vector_length(this, $src2);
4753     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
4754                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
4755   %}
4756   ins_pipe( pipe_slow );
4757 %}
4758 
4760 instruct minmax_reduction2D_av(legRegD dst, legVec src,
4761                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4762                                rFlagsReg cr) %{
4763   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4764             Matcher::vector_length(n->in(2)) == 2);
4765   match(Set dst (MinReductionV dst src));
4766   match(Set dst (MaxReductionV dst src));
4767   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4768   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4769   ins_encode %{
4770     assert(UseAVX > 0, "sanity");
4771 
4772     int opcode = this->ideal_Opcode();
4773     int vlen = Matcher::vector_length(this, $src);
4774     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
4775                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
4776   %}
4777   ins_pipe( pipe_slow );
4778 %}
4779 
4780 instruct minmax_reductionD_av(legRegD dst, legVec src,
4781                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
4782                               rFlagsReg cr) %{
4783   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4784             Matcher::vector_length(n->in(2)) >= 4);
4785   match(Set dst (MinReductionV dst src));
4786   match(Set dst (MaxReductionV dst src));
4787   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
4788   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
4789   ins_encode %{
4790     assert(UseAVX > 0, "sanity");
4791 
4792     int opcode = this->ideal_Opcode();
4793     int vlen = Matcher::vector_length(this, $src);
4794     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
4795                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
4796   %}
4797   ins_pipe( pipe_slow );
4798 %}
4799 
4800 // ====================VECTOR ARITHMETIC=======================================
4801 
4802 // --------------------------------- ADD --------------------------------------
4803 
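// Each element type's add comes in three shapes: a two-operand SSE form that updates dst
// in place (UseAVX == 0), a three-operand AVX register form, and an AVX register-memory
// form for vectors larger than 8 bytes.
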
4804 // Bytes vector add
4805 instruct vaddB(vec dst, vec src) %{
4806   predicate(UseAVX == 0);
4807   match(Set dst (AddVB dst src));
4808   format %{ "paddb   $dst,$src\t! add packedB" %}
4809   ins_encode %{
4810     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
4811   %}
4812   ins_pipe( pipe_slow );
4813 %}
4814 
4815 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
4816   predicate(UseAVX > 0);
4817   match(Set dst (AddVB src1 src2));
4818   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
4819   ins_encode %{
4820     int vlen_enc = vector_length_encoding(this);
4821     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4822   %}
4823   ins_pipe( pipe_slow );
4824 %}
4825 
4826 instruct vaddB_mem(vec dst, vec src, memory mem) %{
4827   predicate((UseAVX > 0) &&
4828             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4829   match(Set dst (AddVB src (LoadVector mem)));
4830   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
4831   ins_encode %{
4832     int vlen_enc = vector_length_encoding(this);
4833     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4834   %}
4835   ins_pipe( pipe_slow );
4836 %}
4837 
4838 // Shorts/Chars vector add
4839 instruct vaddS(vec dst, vec src) %{
4840   predicate(UseAVX == 0);
4841   match(Set dst (AddVS dst src));
4842   format %{ "paddw   $dst,$src\t! add packedS" %}
4843   ins_encode %{
4844     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
4845   %}
4846   ins_pipe( pipe_slow );
4847 %}
4848 
4849 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
4850   predicate(UseAVX > 0);
4851   match(Set dst (AddVS src1 src2));
4852   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
4853   ins_encode %{
4854     int vlen_enc = vector_length_encoding(this);
4855     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4856   %}
4857   ins_pipe( pipe_slow );
4858 %}
4859 
4860 instruct vaddS_mem(vec dst, vec src, memory mem) %{
4861   predicate((UseAVX > 0) &&
4862             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4863   match(Set dst (AddVS src (LoadVector mem)));
4864   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
4865   ins_encode %{
4866     int vlen_enc = vector_length_encoding(this);
4867     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4868   %}
4869   ins_pipe( pipe_slow );
4870 %}
4871 
4872 // Integers vector add
4873 instruct vaddI(vec dst, vec src) %{
4874   predicate(UseAVX == 0);
4875   match(Set dst (AddVI dst src));
4876   format %{ "paddd   $dst,$src\t! add packedI" %}
4877   ins_encode %{
4878     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
4879   %}
4880   ins_pipe( pipe_slow );
4881 %}
4882 
4883 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
4884   predicate(UseAVX > 0);
4885   match(Set dst (AddVI src1 src2));
4886   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
4887   ins_encode %{
4888     int vlen_enc = vector_length_encoding(this);
4889     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4890   %}
4891   ins_pipe( pipe_slow );
4892 %}
4893 
4895 instruct vaddI_mem(vec dst, vec src, memory mem) %{
4896   predicate((UseAVX > 0) &&
4897             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4898   match(Set dst (AddVI src (LoadVector mem)));
4899   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
4900   ins_encode %{
4901     int vlen_enc = vector_length_encoding(this);
4902     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4903   %}
4904   ins_pipe( pipe_slow );
4905 %}
4906 
4907 // Longs vector add
4908 instruct vaddL(vec dst, vec src) %{
4909   predicate(UseAVX == 0);
4910   match(Set dst (AddVL dst src));
4911   format %{ "paddq   $dst,$src\t! add packedL" %}
4912   ins_encode %{
4913     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
4914   %}
4915   ins_pipe( pipe_slow );
4916 %}
4917 
4918 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
4919   predicate(UseAVX > 0);
4920   match(Set dst (AddVL src1 src2));
4921   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
4922   ins_encode %{
4923     int vlen_enc = vector_length_encoding(this);
4924     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4925   %}
4926   ins_pipe( pipe_slow );
4927 %}
4928 
4929 instruct vaddL_mem(vec dst, vec src, memory mem) %{
4930   predicate((UseAVX > 0) &&
4931             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4932   match(Set dst (AddVL src (LoadVector mem)));
4933   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
4934   ins_encode %{
4935     int vlen_enc = vector_length_encoding(this);
4936     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4937   %}
4938   ins_pipe( pipe_slow );
4939 %}
4940 
4941 // Floats vector add
4942 instruct vaddF(vec dst, vec src) %{
4943   predicate(UseAVX == 0);
4944   match(Set dst (AddVF dst src));
4945   format %{ "addps   $dst,$src\t! add packedF" %}
4946   ins_encode %{
4947     __ addps($dst$$XMMRegister, $src$$XMMRegister);
4948   %}
4949   ins_pipe( pipe_slow );
4950 %}
4951 
4952 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
4953   predicate(UseAVX > 0);
4954   match(Set dst (AddVF src1 src2));
4955   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
4956   ins_encode %{
4957     int vlen_enc = vector_length_encoding(this);
4958     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4959   %}
4960   ins_pipe( pipe_slow );
4961 %}
4962 
4963 instruct vaddF_mem(vec dst, vec src, memory mem) %{
4964   predicate((UseAVX > 0) &&
4965             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4966   match(Set dst (AddVF src (LoadVector mem)));
4967   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
4968   ins_encode %{
4969     int vlen_enc = vector_length_encoding(this);
4970     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4971   %}
4972   ins_pipe( pipe_slow );
4973 %}
4974 
4975 // Doubles vector add
4976 instruct vaddD(vec dst, vec src) %{
4977   predicate(UseAVX == 0);
4978   match(Set dst (AddVD dst src));
4979   format %{ "addpd   $dst,$src\t! add packedD" %}
4980   ins_encode %{
4981     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
4982   %}
4983   ins_pipe( pipe_slow );
4984 %}
4985 
4986 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
4987   predicate(UseAVX > 0);
4988   match(Set dst (AddVD src1 src2));
4989   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
4990   ins_encode %{
4991     int vlen_enc = vector_length_encoding(this);
4992     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4993   %}
4994   ins_pipe( pipe_slow );
4995 %}
4996 
4997 instruct vaddD_mem(vec dst, vec src, memory mem) %{
4998   predicate((UseAVX > 0) &&
4999             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5000   match(Set dst (AddVD src (LoadVector mem)));
5001   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
5002   ins_encode %{
5003     int vlen_enc = vector_length_encoding(this);
5004     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5005   %}
5006   ins_pipe( pipe_slow );
5007 %}
5008 
5009 // --------------------------------- SUB --------------------------------------
5010 
5011 // Bytes vector sub
5012 instruct vsubB(vec dst, vec src) %{
5013   predicate(UseAVX == 0);
5014   match(Set dst (SubVB dst src));
5015   format %{ "psubb   $dst,$src\t! sub packedB" %}
5016   ins_encode %{
5017     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5018   %}
5019   ins_pipe( pipe_slow );
5020 %}
5021 
5022 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
5023   predicate(UseAVX > 0);
5024   match(Set dst (SubVB src1 src2));
5025   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
5026   ins_encode %{
5027     int vlen_enc = vector_length_encoding(this);
5028     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5029   %}
5030   ins_pipe( pipe_slow );
5031 %}
5032 
5033 instruct vsubB_mem(vec dst, vec src, memory mem) %{
5034   predicate((UseAVX > 0) &&
5035             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5036   match(Set dst (SubVB src (LoadVector mem)));
5037   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
5038   ins_encode %{
5039     int vlen_enc = vector_length_encoding(this);
5040     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5041   %}
5042   ins_pipe( pipe_slow );
5043 %}
5044 
5045 // Shorts/Chars vector sub
5046 instruct vsubS(vec dst, vec src) %{
5047   predicate(UseAVX == 0);
5048   match(Set dst (SubVS dst src));
5049   format %{ "psubw   $dst,$src\t! sub packedS" %}
5050   ins_encode %{
5051     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5052   %}
5053   ins_pipe( pipe_slow );
5054 %}
5055 
5057 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
5058   predicate(UseAVX > 0);
5059   match(Set dst (SubVS src1 src2));
5060   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
5061   ins_encode %{
5062     int vlen_enc = vector_length_encoding(this);
5063     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5064   %}
5065   ins_pipe( pipe_slow );
5066 %}
5067 
5068 instruct vsubS_mem(vec dst, vec src, memory mem) %{
5069   predicate((UseAVX > 0) &&
5070             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5071   match(Set dst (SubVS src (LoadVector mem)));
5072   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
5073   ins_encode %{
5074     int vlen_enc = vector_length_encoding(this);
5075     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5076   %}
5077   ins_pipe( pipe_slow );
5078 %}
5079 
5080 // Integers vector sub
5081 instruct vsubI(vec dst, vec src) %{
5082   predicate(UseAVX == 0);
5083   match(Set dst (SubVI dst src));
5084   format %{ "psubd   $dst,$src\t! sub packedI" %}
5085   ins_encode %{
5086     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
5087   %}
5088   ins_pipe( pipe_slow );
5089 %}
5090 
5091 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
5092   predicate(UseAVX > 0);
5093   match(Set dst (SubVI src1 src2));
5094   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
5095   ins_encode %{
5096     int vlen_enc = vector_length_encoding(this);
5097     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5098   %}
5099   ins_pipe( pipe_slow );
5100 %}
5101 
5102 instruct vsubI_mem(vec dst, vec src, memory mem) %{
5103   predicate((UseAVX > 0) &&
5104             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5105   match(Set dst (SubVI src (LoadVector mem)));
5106   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
5107   ins_encode %{
5108     int vlen_enc = vector_length_encoding(this);
5109     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5110   %}
5111   ins_pipe( pipe_slow );
5112 %}
5113 
5114 // Longs vector sub
5115 instruct vsubL(vec dst, vec src) %{
5116   predicate(UseAVX == 0);
5117   match(Set dst (SubVL dst src));
5118   format %{ "psubq   $dst,$src\t! sub packedL" %}
5119   ins_encode %{
5120     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
5121   %}
5122   ins_pipe( pipe_slow );
5123 %}
5124 
5125 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
5126   predicate(UseAVX > 0);
5127   match(Set dst (SubVL src1 src2));
5128   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
5129   ins_encode %{
5130     int vlen_enc = vector_length_encoding(this);
5131     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5132   %}
5133   ins_pipe( pipe_slow );
5134 %}
5135 
5137 instruct vsubL_mem(vec dst, vec src, memory mem) %{
5138   predicate((UseAVX > 0) &&
5139             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5140   match(Set dst (SubVL src (LoadVector mem)));
5141   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
5142   ins_encode %{
5143     int vlen_enc = vector_length_encoding(this);
5144     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5145   %}
5146   ins_pipe( pipe_slow );
5147 %}
5148 
5149 // Floats vector sub
5150 instruct vsubF(vec dst, vec src) %{
5151   predicate(UseAVX == 0);
5152   match(Set dst (SubVF dst src));
5153   format %{ "subps   $dst,$src\t! sub packedF" %}
5154   ins_encode %{
5155     __ subps($dst$$XMMRegister, $src$$XMMRegister);
5156   %}
5157   ins_pipe( pipe_slow );
5158 %}
5159 
5160 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
5161   predicate(UseAVX > 0);
5162   match(Set dst (SubVF src1 src2));
5163   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
5164   ins_encode %{
5165     int vlen_enc = vector_length_encoding(this);
5166     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5167   %}
5168   ins_pipe( pipe_slow );
5169 %}
5170 
5171 instruct vsubF_mem(vec dst, vec src, memory mem) %{
5172   predicate((UseAVX > 0) &&
5173             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5174   match(Set dst (SubVF src (LoadVector mem)));
5175   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
5176   ins_encode %{
5177     int vlen_enc = vector_length_encoding(this);
5178     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5179   %}
5180   ins_pipe( pipe_slow );
5181 %}
5182 
5183 // Doubles vector sub
5184 instruct vsubD(vec dst, vec src) %{
5185   predicate(UseAVX == 0);
5186   match(Set dst (SubVD dst src));
5187   format %{ "subpd   $dst,$src\t! sub packedD" %}
5188   ins_encode %{
5189     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
5190   %}
5191   ins_pipe( pipe_slow );
5192 %}
5193 
5194 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
5195   predicate(UseAVX > 0);
5196   match(Set dst (SubVD src1 src2));
5197   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
5198   ins_encode %{
5199     int vlen_enc = vector_length_encoding(this);
5200     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5201   %}
5202   ins_pipe( pipe_slow );
5203 %}
5204 
5205 instruct vsubD_mem(vec dst, vec src, memory mem) %{
5206   predicate((UseAVX > 0) &&
5207             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5208   match(Set dst (SubVD src (LoadVector mem)));
5209   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
5210   ins_encode %{
5211     int vlen_enc = vector_length_encoding(this);
5212     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5213   %}
5214   ins_pipe( pipe_slow );
5215 %}
5216 
5217 // --------------------------------- MUL --------------------------------------
5218 
5219 // Byte vector mul
5220 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5221   predicate(Matcher::vector_length(n) == 4 ||
5222             Matcher::vector_length(n) == 8);
5223   match(Set dst (MulVB src1 src2));
5224   effect(TEMP dst, TEMP tmp, TEMP scratch);
5225   format %{ "vector_mulB $dst,$src1,$src2" %}
5226   ins_encode %{
5227     assert(UseSSE > 3, "required");
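    // Widen the byte lanes to words (pmovsxbw), multiply as 16-bit lanes, then
    // mask each word down to its low byte and pack back to bytes. The low 8 bits
    // of each product do not depend on whether the operands were sign- or
    // zero-extended, so sign extension is safe here.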
5228     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
5229     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
5230     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
5231     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5232     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5233     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5234   %}
5235   ins_pipe( pipe_slow );
5236 %}
5237 
5238 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5239   predicate(Matcher::vector_length(n) == 16 && UseAVX <= 1);
5240   match(Set dst (MulVB src1 src2));
5241   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5242   format %{ "vector_mulB $dst,$src1,$src2" %}
5243   ins_encode %{
5244     assert(UseSSE > 3, "required");
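    // Same widen/multiply/mask/pack scheme as mulB_reg above, applied separately
    // to the low and high 8 bytes (pshufd 0xEE selects the upper 64 bits), with
    // the two word results merged by the final packuswb.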
5245     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
5246     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
5247     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
5248     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
5249     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
5250     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5251     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
5252     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
5253     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5254     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5255     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5256     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5257   %}
5258   ins_pipe( pipe_slow );
5259 %}
5260 
5261 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5262   predicate(Matcher::vector_length(n) == 16 && UseAVX > 1);
5263   match(Set dst (MulVB src1 src2));
5264   effect(TEMP dst, TEMP tmp, TEMP scratch);
5265   format %{ "vector_mulB $dst,$src1,$src2" %}
5266   ins_encode %{
5267     int vlen_enc = Assembler::AVX_256bit;
5268     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5269     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5270     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5271     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5272     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5273     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
5274     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
5275   %}
5276   ins_pipe( pipe_slow );
5277 %}
5278 
5279 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5280   predicate(Matcher::vector_length(n) == 32);
5281   match(Set dst (MulVB src1 src2));
5282   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5283   format %{ "vector_mulB $dst,$src1,$src2" %}
5284   ins_encode %{
5285     assert(UseAVX > 1, "required");
5286     int vlen_enc = Assembler::AVX_256bit;
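    // Multiply the high and low 16-byte halves as 16-bit lanes, mask each word to
    // its low byte and pack; vpermq with 0xD8 then repairs the 64-bit lane
    // interleaving left behind by the in-lane vpackuswb.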
5287     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5288     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
5289     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5290     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5291     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5292     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5293     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5294     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5295     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5296     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5297     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5298     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5299     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5300     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5301   %}
5302   ins_pipe( pipe_slow );
5303 %}
5304 
5305 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5306   predicate(Matcher::vector_length(n) == 64);
5307   match(Set dst (MulVB src1 src2));
5308   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5309   format %{ "vector_mulB $dst,$src1,$src2" %}
5310   ins_encode %{
5311     assert(UseAVX > 2, "required");
5312     int vlen_enc = Assembler::AVX_512bit;
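    // 512-bit variant of the scheme above; the final cross-lane fix-up uses the
    // vector_byte_perm_mask table as the vpermq permutation operand.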
5313     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5314     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
5315     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5316     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5317     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5318     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5319     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5320     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5321     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5322     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5323     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5324     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5325     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5326     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
5327     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5328   %}
5329   ins_pipe( pipe_slow );
5330 %}
5331 
5332 // Shorts/Chars vector mul
5333 instruct vmulS(vec dst, vec src) %{
5334   predicate(UseAVX == 0);
5335   match(Set dst (MulVS dst src));
5336   format %{ "pmullw $dst,$src\t! mul packedS" %}
5337   ins_encode %{
5338     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
5339   %}
5340   ins_pipe( pipe_slow );
5341 %}
5342 
5343 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
5344   predicate(UseAVX > 0);
5345   match(Set dst (MulVS src1 src2));
5346   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
5347   ins_encode %{
5348     int vlen_enc = vector_length_encoding(this);
5349     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5350   %}
5351   ins_pipe( pipe_slow );
5352 %}
5353 
5354 instruct vmulS_mem(vec dst, vec src, memory mem) %{
5355   predicate((UseAVX > 0) &&
5356             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5357   match(Set dst (MulVS src (LoadVector mem)));
5358   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
5359   ins_encode %{
5360     int vlen_enc = vector_length_encoding(this);
5361     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5362   %}
5363   ins_pipe( pipe_slow );
5364 %}
5365 
5366 // Integers vector mul
5367 instruct vmulI(vec dst, vec src) %{
5368   predicate(UseAVX == 0);
5369   match(Set dst (MulVI dst src));
5370   format %{ "pmulld  $dst,$src\t! mul packedI" %}
5371   ins_encode %{
5372     assert(UseSSE > 3, "required");
5373     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
5374   %}
5375   ins_pipe( pipe_slow );
5376 %}
5377 
5378 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
5379   predicate(UseAVX > 0);
5380   match(Set dst (MulVI src1 src2));
5381   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
5382   ins_encode %{
5383     int vlen_enc = vector_length_encoding(this);
5384     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5385   %}
5386   ins_pipe( pipe_slow );
5387 %}
5388 
5389 instruct vmulI_mem(vec dst, vec src, memory mem) %{
5390   predicate((UseAVX > 0) &&
5391             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5392   match(Set dst (MulVI src (LoadVector mem)));
5393   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
5394   ins_encode %{
5395     int vlen_enc = vector_length_encoding(this);
5396     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5397   %}
5398   ins_pipe( pipe_slow );
5399 %}
5400 
5401 // Longs vector mul
5402 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
5403   predicate(VM_Version::supports_avx512dq());
5404   match(Set dst (MulVL src1 src2));
5405   format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
5406   ins_encode %{
5407     assert(UseAVX > 2, "required");
5408     int vlen_enc = vector_length_encoding(this);
5409     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5410   %}
5411   ins_pipe( pipe_slow );
5412 %}
5413 
5414 instruct vmulL_mem(vec dst, vec src, memory mem) %{
5415   predicate(VM_Version::supports_avx512dq() &&
5416             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5417   match(Set dst (MulVL src (LoadVector mem)));
5418   format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
5419   ins_encode %{
5420     assert(UseAVX > 2, "required");
5421     int vlen_enc = vector_length_encoding(this);
5422     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5423   %}
5424   ins_pipe( pipe_slow );
5425 %}
5426 
5427 instruct mul2L_reg(vec dst, vec src2, legVec tmp) %{
5428   predicate(Matcher::vector_length(n) == 2 && !VM_Version::supports_avx512dq());
5429   match(Set dst (MulVL dst src2));
5430   effect(TEMP dst, TEMP tmp);
5431   format %{ "pshufd $tmp,$src2, 177\n\t"
5432             "pmulld $tmp,$dst\n\t"
5433             "phaddd $tmp,$tmp\n\t"
5434             "pmovzxdq $tmp,$tmp\n\t"
5435             "psllq $tmp, 32\n\t"
5436             "pmuludq $dst,$src2\n\t"
5437             "paddq $dst,$tmp\n\t! mul packed2L" %}
5438 
5439   ins_encode %{
5440     assert(VM_Version::supports_sse4_1(), "required");
5441     int vlen_enc = Assembler::AVX_128bit;
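    // Build the 64x64->64 bit multiply from 32-bit pieces: per lane,
    //   a*b mod 2^64 = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32).
    // pshufd 177 swaps the dword halves of each lane, pmulld/phaddd form the
    // cross-product sum, psllq positions it in the high half, pmuludq supplies
    // lo(a)*lo(b), and paddq combines the two parts.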
5442     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
5443     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
5444     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5445     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
5446     __ psllq($tmp$$XMMRegister, 32);
5447     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
5448     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
5449   %}
5450   ins_pipe( pipe_slow );
5451 %}
5452 
5453 instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, legVec tmp, legVec tmp1) %{
5454   predicate(Matcher::vector_length(n) == 4 && !VM_Version::supports_avx512dq());
5455   match(Set dst (MulVL src1 src2));
5456   effect(TEMP tmp1, TEMP tmp);
5457   format %{ "vpshufd $tmp,$src2,177\n\t"
5458             "vpmulld $tmp,$src1,$tmp\n\t"
5459             "vphaddd $tmp,$tmp,$tmp\n\t"
5460             "vpmovzxdq $tmp,$tmp\n\t"
5461             "vpsllq $tmp,$tmp,32\n\t"
5462             "vpmuludq $tmp1,$src1,$src2\n\t"
5463             "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
5464   ins_encode %{
5465     int vlen_enc = Assembler::AVX_256bit;
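    // Same 32-bit decomposition as mul2L_reg above, applied to four lanes; the
    // vextracti128_high/vphaddd pair gathers the per-lane cross-product sums into
    // the low 128 bits before they are widened and shifted into place.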
5466     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
5467     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5468     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
5469     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5470     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5471     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
5472     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5473     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5474   %}
5475   ins_pipe( pipe_slow );
5476 %}
5477 
5478 // Floats vector mul
5479 instruct vmulF(vec dst, vec src) %{
5480   predicate(UseAVX == 0);
5481   match(Set dst (MulVF dst src));
5482   format %{ "mulps   $dst,$src\t! mul packedF" %}
5483   ins_encode %{
5484     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
5485   %}
5486   ins_pipe( pipe_slow );
5487 %}
5488 
5489 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
5490   predicate(UseAVX > 0);
5491   match(Set dst (MulVF src1 src2));
5492   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
5493   ins_encode %{
5494     int vlen_enc = vector_length_encoding(this);
5495     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5496   %}
5497   ins_pipe( pipe_slow );
5498 %}
5499 
5500 instruct vmulF_mem(vec dst, vec src, memory mem) %{
5501   predicate((UseAVX > 0) &&
5502             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5503   match(Set dst (MulVF src (LoadVector mem)));
5504   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
5505   ins_encode %{
5506     int vlen_enc = vector_length_encoding(this);
5507     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5508   %}
5509   ins_pipe( pipe_slow );
5510 %}
5511 
5512 // Doubles vector mul
5513 instruct vmulD(vec dst, vec src) %{
5514   predicate(UseAVX == 0);
5515   match(Set dst (MulVD dst src));
5516   format %{ "mulpd   $dst,$src\t! mul packedD" %}
5517   ins_encode %{
5518     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
5519   %}
5520   ins_pipe( pipe_slow );
5521 %}
5522 
5523 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
5524   predicate(UseAVX > 0);
5525   match(Set dst (MulVD src1 src2));
5526   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
5527   ins_encode %{
5528     int vlen_enc = vector_length_encoding(this);
5529     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5530   %}
5531   ins_pipe( pipe_slow );
5532 %}
5533 
5534 instruct vmulD_mem(vec dst, vec src, memory mem) %{
5535   predicate((UseAVX > 0) &&
5536             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5537   match(Set dst (MulVD src (LoadVector mem)));
5538   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
5539   ins_encode %{
5540     int vlen_enc = vector_length_encoding(this);
5541     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5542   %}
5543   ins_pipe( pipe_slow );
5544 %}
5545 
5546 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5547   predicate(Matcher::vector_length(n) == 8);
5548   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
5549   effect(TEMP dst, USE src1, USE src2);
5550   format %{ "vcmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
5551             "vblendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
5552          %}
5553   ins_encode %{
5554     assert(UseAVX > 0, "required");
5555 
5556     int vlen_enc = Assembler::AVX_256bit;
5557     int cond = (Assembler::Condition)($copnd$$cmpcode);
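    // vcmpps leaves a per-lane all-ones/all-zeros mask in dst; vblendvps then
    // selects src2 where the mask is set and src1 elsewhere.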
5558     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5559     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5560   %}
5561   ins_pipe( pipe_slow );
5562 %}
5563 
5564 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5565   predicate(Matcher::vector_length(n) == 4);
5566   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
5567   effect(TEMP dst, USE src1, USE src2);
5568   format %{ "vcmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
5569             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
5570          %}
5571   ins_encode %{
5572     assert(UseAVX > 0, "required");
5573 
5574     int vlen_enc = Assembler::AVX_256bit;
5575     int cond = (Assembler::Condition)($copnd$$cmpcode);
5576     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5577     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5578   %}
5579   ins_pipe( pipe_slow );
5580 %}
5581 
5582 // --------------------------------- DIV --------------------------------------
5583 
5584 // Floats vector div
5585 instruct vdivF(vec dst, vec src) %{
5586   predicate(UseAVX == 0);
5587   match(Set dst (DivVF dst src));
5588   format %{ "divps   $dst,$src\t! div packedF" %}
5589   ins_encode %{
5590     __ divps($dst$$XMMRegister, $src$$XMMRegister);
5591   %}
5592   ins_pipe( pipe_slow );
5593 %}
5594 
5595 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
5596   predicate(UseAVX > 0);
5597   match(Set dst (DivVF src1 src2));
5598   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
5599   ins_encode %{
5600     int vlen_enc = vector_length_encoding(this);
5601     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5602   %}
5603   ins_pipe( pipe_slow );
5604 %}
5605 
5606 instruct vdivF_mem(vec dst, vec src, memory mem) %{
5607   predicate((UseAVX > 0) &&
5608             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5609   match(Set dst (DivVF src (LoadVector mem)));
5610   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
5611   ins_encode %{
5612     int vlen_enc = vector_length_encoding(this);
5613     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5614   %}
5615   ins_pipe( pipe_slow );
5616 %}
5617 
5618 // Doubles vector div
5619 instruct vdivD(vec dst, vec src) %{
5620   predicate(UseAVX == 0);
5621   match(Set dst (DivVD dst src));
5622   format %{ "divpd   $dst,$src\t! div packedD" %}
5623   ins_encode %{
5624     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
5625   %}
5626   ins_pipe( pipe_slow );
5627 %}
5628 
5629 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
5630   predicate(UseAVX > 0);
5631   match(Set dst (DivVD src1 src2));
5632   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
5633   ins_encode %{
5634     int vlen_enc = vector_length_encoding(this);
5635     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5636   %}
5637   ins_pipe( pipe_slow );
5638 %}
5639 
5640 instruct vdivD_mem(vec dst, vec src, memory mem) %{
5641   predicate((UseAVX > 0) &&
5642             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5643   match(Set dst (DivVD src (LoadVector mem)));
5644   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
5645   ins_encode %{
5646     int vlen_enc = vector_length_encoding(this);
5647     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5648   %}
5649   ins_pipe( pipe_slow );
5650 %}
5651 
5652 // ------------------------------ MinMax ---------------------------------------
5653 
5654 // Byte, Short, Int vector Min/Max
5655 instruct minmax_reg_sse(vec dst, vec src) %{
5656   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5657             UseAVX == 0);
5658   match(Set dst (MinV dst src));
5659   match(Set dst (MaxV dst src));
5660   format %{ "vector_minmax  $dst,$src\t!  " %}
5661   ins_encode %{
5662     assert(UseSSE >= 4, "required");
5663 
5664     int opcode = this->ideal_Opcode();
5665     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5666     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
5667   %}
5668   ins_pipe( pipe_slow );
5669 %}
5670 
5671 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
5672   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5673             UseAVX > 0);
5674   match(Set dst (MinV src1 src2));
5675   match(Set dst (MaxV src1 src2));
5676   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
5677   ins_encode %{
5678     int opcode = this->ideal_Opcode();
5679     int vlen_enc = vector_length_encoding(this);
5680     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5681 
5682     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5683   %}
5684   ins_pipe( pipe_slow );
5685 %}
5686 
5687 // Long vector Min/Max
5688 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
5689   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
5690             UseAVX == 0);
5691   match(Set dst (MinV dst src));
5692   match(Set dst (MaxV src dst));
5693   effect(TEMP dst, TEMP tmp);
5694   format %{ "vector_minmaxL  $dst,$src\t! using $tmp as TEMP" %}
5695   ins_encode %{
5696     assert(UseSSE >= 4, "required");
5697 
5698     int opcode = this->ideal_Opcode();
5699     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5700     assert(elem_bt == T_LONG, "sanity");
5701 
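    // The TEMP operand is constrained to rxmm0, presumably because the long
    // min/max helper falls back to an SSE4.1 blend (blendvpd/pblendvb), which
    // uses xmm0 as its implicit mask register.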
5702     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
5703   %}
5704   ins_pipe( pipe_slow );
5705 %}
5706 
5707 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
5708   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
5709             UseAVX > 0 && !VM_Version::supports_avx512vl());
5710   match(Set dst (MinV src1 src2));
5711   match(Set dst (MaxV src1 src2));
5712   effect(TEMP dst);
5713   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
5714   ins_encode %{
5715     int vlen_enc = vector_length_encoding(this);
5716     int opcode = this->ideal_Opcode();
5717     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5718     assert(elem_bt == T_LONG, "sanity");
5719 
5720     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5721   %}
5722   ins_pipe( pipe_slow );
5723 %}
5724 
5725 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
5726   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
5727             Matcher::vector_element_basic_type(n) == T_LONG);
5728   match(Set dst (MinV src1 src2));
5729   match(Set dst (MaxV src1 src2));
5730   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
5731   ins_encode %{
5732     assert(UseAVX > 2, "required");
5733 
5734     int vlen_enc = vector_length_encoding(this);
5735     int opcode = this->ideal_Opcode();
5736     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5737     assert(elem_bt == T_LONG, "sanity");
5738 
5739     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5740   %}
5741   ins_pipe( pipe_slow );
5742 %}
5743 
5744 // Float/Double vector Min/Max
5745 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
5746   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
5747             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
5748             UseAVX > 0);
5749   match(Set dst (MinV a b));
5750   match(Set dst (MaxV a b));
5751   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
5752   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
5753   ins_encode %{
5754     assert(UseAVX > 0, "required");
5755 
5756     int opcode = this->ideal_Opcode();
5757     int vlen_enc = vector_length_encoding(this);
5758     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5759 
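    // Java Math.min/max semantics (NaN propagation, -0.0 ordered below +0.0)
    // cannot be obtained from a single vminps/vmaxps, which is presumably why
    // the helper needs the extra blend temporaries.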
5760     __ vminmax_fp(opcode, elem_bt,
5761                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
5762                   $tmp$$XMMRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
5763   %}
5764   ins_pipe( pipe_slow );
5765 %}
5766 
5767 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
5768   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
5769             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
5770   match(Set dst (MinV a b));
5771   match(Set dst (MaxV a b));
5772   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
5773   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
5774   ins_encode %{
5775     assert(UseAVX > 2, "required");
5776 
5777     int opcode = this->ideal_Opcode();
5778     int vlen_enc = vector_length_encoding(this);
5779     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5780 
5781     __ evminmax_fp(opcode, elem_bt,
5782                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
5783                    $ktmp$$KRegister, $atmp$$XMMRegister, $btmp$$XMMRegister, vlen_enc);
5784   %}
5785   ins_pipe( pipe_slow );
5786 %}
5787 
5788 // --------------------------------- Signum/CopySign ---------------------------
5789 
5790 instruct signumF_reg(regF dst, regF zero, regF one, rRegP scratch, rFlagsReg cr) %{
5791   match(Set dst (SignumF dst (Binary zero one)));
5792   effect(TEMP scratch, KILL cr);
5793   format %{ "signumF $dst, $dst\t! using $scratch as TEMP" %}
5794   ins_encode %{
5795     int opcode = this->ideal_Opcode();
5796     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
5797   %}
5798   ins_pipe( pipe_slow );
5799 %}
5800 
5801 instruct signumD_reg(regD dst, regD zero, regD one, rRegP scratch, rFlagsReg cr) %{
5802   match(Set dst (SignumD dst (Binary zero one)));
5803   effect(TEMP scratch, KILL cr);
5804   format %{ "signumD $dst, $dst\t! using $scratch as TEMP" %}
5805   ins_encode %{
5806     int opcode = this->ideal_Opcode();
5807     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
5808   %}
5809   ins_pipe( pipe_slow );
5810 %}
5811 
5812 // ---------------------------------------
5813 // For copySign use 0xE4 as writemask for vpternlog
5814 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
5815 // C (xmm2) is set to 0x7FFFFFFF
5816 // Wherever xmm2 is 0, we want to pick from B (the sign operand)
5817 // Wherever xmm2 is 1, we want to pick from A (the magnitude operand)
5818 //
5819 // A B C Result
5820 // 0 0 0 0
5821 // 0 0 1 0
5822 // 0 1 0 1
5823 // 0 1 1 0
5824 // 1 0 0 0
5825 // 1 0 1 1
5826 // 1 1 0 1
5827 // 1 1 1 1
5828 //
5829 // Result going from high bit to low bit is 0b11100100 = 0xE4
5830 // ---------------------------------------
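// As a minimal scalar sketch of the same selection (illustration only; the
// helper name copysign_bits is hypothetical and not part of the generated code),
// assuming the mask constant is 0x7FFFFFFF as stated above:
//
//   static uint32_t copysign_bits(uint32_t magnitude, uint32_t sign_src) {
//     const uint32_t mask = 0x7FFFFFFF;                // operand C of vpternlog
//     return (magnitude & mask) | (sign_src & ~mask);  // what imm8 0xE4 selects
//   }
//
// Picking A where C is 1 and B where C is 0, bit by bit, reproduces the truth
// table above.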
5831 
5832 #ifdef _LP64
5833 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
5834   match(Set dst (CopySignF dst src));
5835   effect(TEMP tmp1, TEMP tmp2);
5836   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
5837   ins_encode %{
5838     __ movl($tmp2$$Register, 0x7FFFFFFF);
5839     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
5840     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
5841   %}
5842   ins_pipe( pipe_slow );
5843 %}
5844 
5845 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
5846   match(Set dst (CopySignD dst (Binary src zero)));
5847   ins_cost(100);
5848   effect(TEMP tmp1, TEMP tmp2);
5849   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
5850   ins_encode %{
5851     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
5852     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
5853     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
5854   %}
5855   ins_pipe( pipe_slow );
5856 %}
5857 #endif // _LP64
5858 
5859 // --------------------------------- Sqrt --------------------------------------
5860 
5861 instruct vsqrtF_reg(vec dst, vec src) %{
5862   match(Set dst (SqrtVF src));
5863   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
5864   ins_encode %{
5865     assert(UseAVX > 0, "required");
5866     int vlen_enc = vector_length_encoding(this);
5867     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5868   %}
5869   ins_pipe( pipe_slow );
5870 %}
5871 
5872 instruct vsqrtF_mem(vec dst, memory mem) %{
5873   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
5874   match(Set dst (SqrtVF (LoadVector mem)));
5875   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
5876   ins_encode %{
5877     assert(UseAVX > 0, "required");
5878     int vlen_enc = vector_length_encoding(this);
5879     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
5880   %}
5881   ins_pipe( pipe_slow );
5882 %}
5883 
5884 // Doubles vector sqrt
5885 instruct vsqrtD_reg(vec dst, vec src) %{
5886   match(Set dst (SqrtVD src));
5887   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
5888   ins_encode %{
5889     assert(UseAVX > 0, "required");
5890     int vlen_enc = vector_length_encoding(this);
5891     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5892   %}
5893   ins_pipe( pipe_slow );
5894 %}
5895 
5896 instruct vsqrtD_mem(vec dst, memory mem) %{
5897   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
5898   match(Set dst (SqrtVD (LoadVector mem)));
5899   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
5900   ins_encode %{
5901     assert(UseAVX > 0, "required");
5902     int vlen_enc = vector_length_encoding(this);
5903     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
5904   %}
5905   ins_pipe( pipe_slow );
5906 %}
5907 
5908 // ------------------------------ Shift ---------------------------------------
5909 
5910 // Left and right shift count vectors are the same on x86
5911 // (only lowest bits of xmm reg are used for count).
5912 instruct vshiftcnt(vec dst, rRegI cnt) %{
5913   match(Set dst (LShiftCntV cnt));
5914   match(Set dst (RShiftCntV cnt));
5915   format %{ "movdl    $dst,$cnt\t! load shift count" %}
5916   ins_encode %{
5917     __ movdl($dst$$XMMRegister, $cnt$$Register);
5918   %}
5919   ins_pipe( pipe_slow );
5920 %}
5921 
5922 // Byte vector shift
5923 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5924   predicate(Matcher::vector_length(n) <= 8 && VectorNode::is_vshift_cnt(n->in(2)));
5925   match(Set dst ( LShiftVB src shift));
5926   match(Set dst ( RShiftVB src shift));
5927   match(Set dst (URShiftVB src shift));
5928   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
5929   format %{ "vector_byte_shift $dst,$src,$shift" %}
5930   ins_encode %{
5931     assert(UseSSE > 3, "required");
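    // Widen the bytes to words (sign-extended except for unsigned right shift),
    // shift as 16-bit lanes, then mask each word to its low byte and pack back.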
5932     int opcode = this->ideal_Opcode();
5933     bool sign = (opcode != Op_URShiftVB);
5934     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
5935     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
5936     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5937     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5938     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5939   %}
5940   ins_pipe( pipe_slow );
5941 %}
5942 
5943 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
5944   predicate(Matcher::vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
5945             UseAVX <= 1);
5946   match(Set dst ( LShiftVB src shift));
5947   match(Set dst ( RShiftVB src shift));
5948   match(Set dst (URShiftVB src shift));
5949   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
5950   format %{ "vector_byte_shift $dst,$src,$shift" %}
5951   ins_encode %{
5952     assert(UseSSE > 3, "required");
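    // As above, but the low and high 8 bytes are widened and shifted separately
    // (pshufd 0xE picks the upper half) and then packed back together.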
5953     int opcode = this->ideal_Opcode();
5954     bool sign = (opcode != Op_URShiftVB);
5955     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
5956     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
5957     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
5958     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
5959     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
5960     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5961     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5962     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5963     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5964   %}
5965   ins_pipe( pipe_slow );
5966 %}
5967 
5968 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5969   predicate(Matcher::vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
5970             UseAVX > 1);
5971   match(Set dst ( LShiftVB src shift));
5972   match(Set dst ( RShiftVB src shift));
5973   match(Set dst (URShiftVB src shift));
5974   effect(TEMP dst, TEMP tmp, TEMP scratch);
5975   format %{ "vector_byte_shift $dst,$src,$shift" %}
5976   ins_encode %{
5977     int opcode = this->ideal_Opcode();
5978     bool sign = (opcode != Op_URShiftVB);
5979     int vlen_enc = Assembler::AVX_256bit;
5980     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
5981     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5982     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5983     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
5984     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
5985   %}
5986   ins_pipe( pipe_slow );
5987 %}
5988 
5989 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5990   predicate(Matcher::vector_length(n) == 32 && VectorNode::is_vshift_cnt(n->in(2)));
5991   match(Set dst ( LShiftVB src shift));
5992   match(Set dst ( RShiftVB src shift));
5993   match(Set dst (URShiftVB src shift));
5994   effect(TEMP dst, TEMP tmp, TEMP scratch);
5995   format %{ "vector_byte_shift $dst,$src,$shift" %}
5996   ins_encode %{
5997     assert(UseAVX > 1, "required");
5998     int opcode = this->ideal_Opcode();
5999     bool sign = (opcode != Op_URShiftVB);
6000     int vlen_enc = Assembler::AVX_256bit;
6001     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
6002     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6003     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6004     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6005     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6006     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6007     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6008     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6009     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6010   %}
6011   ins_pipe( pipe_slow );
6012 %}
6013 
6014 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6015   predicate(Matcher::vector_length(n) == 64 && VectorNode::is_vshift_cnt(n->in(2)));
6016   match(Set dst ( LShiftVB src shift));
6017   match(Set dst ( RShiftVB src shift));
6018   match(Set dst (URShiftVB src shift));
6019   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6020   format %{ "vector_byte_shift $dst,$src,$shift" %}
6021   ins_encode %{
6022     assert(UseAVX > 2, "required");
6023     int opcode = this->ideal_Opcode();
6024     bool sign = (opcode != Op_URShiftVB);
6025     int vlen_enc = Assembler::AVX_512bit;
6026     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6027     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
6028     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6029     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6030     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6031     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6032     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6033     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6034     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6035     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
6036     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
6037     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6038   %}
6039   ins_pipe( pipe_slow );
6040 %}
6041 
6042 // Shorts vector logical right shift produces an incorrect Java result
6043 // for negative data, because Java code converts a short value to an int with
6044 // sign extension before the shift. But char vectors are fine, since chars are
6045 // unsigned values.
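// For example (illustration only): short s = (short)0x8000 is promoted to the
// int 0xFFFF8000, so in Java (s >>> 1) == 0x7FFFC000, whose low 16 bits are
// 0xC000; a packed 16-bit logical shift of 0x8000 by 1 would yield 0x4000 instead.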
6046 // Shorts/Chars vector shift
6047 instruct vshiftS(vec dst, vec src, vec shift) %{
6048   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6049   match(Set dst ( LShiftVS src shift));
6050   match(Set dst ( RShiftVS src shift));
6051   match(Set dst (URShiftVS src shift));
6052   effect(TEMP dst, USE src, USE shift);
6053   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
6054   ins_encode %{
6055     int opcode = this->ideal_Opcode();
6056     if (UseAVX > 0) {
6057       int vlen_enc = vector_length_encoding(this);
6058       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6059     } else {
6060       int vlen = Matcher::vector_length(this);
6061       if (vlen == 2) {
6062         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6063         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6064       } else if (vlen == 4) {
6065         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6066         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6067       } else {
6068         assert (vlen == 8, "sanity");
6069         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6070         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6071       }
6072     }
6073   %}
6074   ins_pipe( pipe_slow );
6075 %}
6076 
6077 // Integers vector shift
6078 instruct vshiftI(vec dst, vec src, vec shift) %{
6079   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6080   match(Set dst ( LShiftVI src shift));
6081   match(Set dst ( RShiftVI src shift));
6082   match(Set dst (URShiftVI src shift));
6083   effect(TEMP dst, USE src, USE shift);
6084   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
6085   ins_encode %{
6086     int opcode = this->ideal_Opcode();
6087     if (UseAVX > 0) {
6088       int vlen_enc = vector_length_encoding(this);
6089       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6090     } else {
6091       int vlen = Matcher::vector_length(this);
6092       if (vlen == 2) {
6093         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6094         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6095       } else {
6096         assert(vlen == 4, "sanity");
6097         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6098         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6099       }
6100     }
6101   %}
6102   ins_pipe( pipe_slow );
6103 %}
6104 
6105 // Integers vector constant shift
6106 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
6107   match(Set dst (LShiftVI src (LShiftCntV shift)));
6108   match(Set dst (RShiftVI src (RShiftCntV shift)));
6109   match(Set dst (URShiftVI src (RShiftCntV shift)));
6110   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
6111   ins_encode %{
6112     int opcode = this->ideal_Opcode();
6113     if (UseAVX > 0) {
6114       int vector_len = vector_length_encoding(this);
6115       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6116     } else {
6117       int vlen = Matcher::vector_length(this);
6118       if (vlen == 2) {
6119         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6120         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6121       } else {
6122         assert(vlen == 4, "sanity");
6123         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6124         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6125       }
6126     }
6127   %}
6128   ins_pipe( pipe_slow );
6129 %}
6130 
6131 // Longs vector shift
6132 instruct vshiftL(vec dst, vec src, vec shift) %{
6133   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6134   match(Set dst ( LShiftVL src shift));
6135   match(Set dst (URShiftVL src shift));
6136   effect(TEMP dst, USE src, USE shift);
6137   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
6138   ins_encode %{
6139     int opcode = this->ideal_Opcode();
6140     if (UseAVX > 0) {
6141       int vlen_enc = vector_length_encoding(this);
6142       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6143     } else {
6144       assert(Matcher::vector_length(this) == 2, "");
6145       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6146       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6147     }
6148   %}
6149   ins_pipe( pipe_slow );
6150 %}
6151 
6152 // Longs vector constant shift
6153 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
6154   match(Set dst (LShiftVL src (LShiftCntV shift)));
6155   match(Set dst (URShiftVL src (RShiftCntV shift)));
6156   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
6157   ins_encode %{
6158     int opcode = this->ideal_Opcode();
6159     if (UseAVX > 0) {
6160       int vector_len = vector_length_encoding(this);
6161       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6162     } else {
6163       assert(Matcher::vector_length(this) == 2, "");
6164       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6165       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6166     }
6167   %}
6168   ins_pipe( pipe_slow );
6169 %}
6170 
6171 // -------------------ArithmeticRightShift -----------------------------------
6172 // Long vector arithmetic right shift
6173 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6174   predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX <= 2);
6175   match(Set dst (RShiftVL src shift));
6176   effect(TEMP dst, TEMP tmp, TEMP scratch);
6177   format %{ "vshiftq $dst,$src,$shift" %}
6178   ins_encode %{
6179     uint vlen = Matcher::vector_length(this);
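    // No packed 64-bit arithmetic right shift is available below AVX-512, so it
    // is emulated as (x >>> n ^ m) - m, where m = vector_long_sign_mask >>> n
    // (presumably 0x8000000000000000 per lane); the subtraction re-extends the
    // original sign bit.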
6180     if (vlen == 2) {
6181       assert(UseSSE >= 2, "required");
6182       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6183       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6184       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6185       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6186       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6187       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6188     } else {
6189       assert(vlen == 4, "sanity");
6190       assert(UseAVX > 1, "required");
6191       int vlen_enc = Assembler::AVX_256bit;
6192       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6193       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6194       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6195       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6196       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6197     }
6198   %}
6199   ins_pipe( pipe_slow );
6200 %}
6201 
6202 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6203   predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX > 2);
6204   match(Set dst (RShiftVL src shift));
6205   format %{ "vshiftq $dst,$src,$shift" %}
6206   ins_encode %{
6207     int vlen_enc = vector_length_encoding(this);
6208     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6209   %}
6210   ins_pipe( pipe_slow );
6211 %}
6212 
6213 // ------------------- Variable Shift -----------------------------
6214 // Byte variable shift
6215 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6216   predicate(Matcher::vector_length(n) <= 8 &&
6217             !VectorNode::is_vshift_cnt(n->in(2)) &&
6218             !VM_Version::supports_avx512bw());
6219   match(Set dst ( LShiftVB src shift));
6220   match(Set dst ( RShiftVB src shift));
6221   match(Set dst (URShiftVB src shift));
6222   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6223   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6224   ins_encode %{
6225     assert(UseAVX >= 2, "required");
6226 
6227     int opcode = this->ideal_Opcode();
6228     int vlen_enc = Assembler::AVX_128bit;
6229     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6230     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6231   %}
6232   ins_pipe( pipe_slow );
6233 %}
6234 
6235 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6236   predicate(Matcher::vector_length(n) == 16 &&
6237             !VectorNode::is_vshift_cnt(n->in(2)) &&
6238             !VM_Version::supports_avx512bw());
6239   match(Set dst ( LShiftVB src shift));
6240   match(Set dst ( RShiftVB src shift));
6241   match(Set dst (URShiftVB src shift));
6242   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6243   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6244   ins_encode %{
6245     assert(UseAVX >= 2, "required");
6246 
6247     int opcode = this->ideal_Opcode();
6248     int vlen_enc = Assembler::AVX_128bit;
6249     // Shift lower half and get word result in dst
6250     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6251 
6252     // Shift upper half and get word result in vtmp1
6253     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6254     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6255     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6256 
6257     // Merge and down convert the two word results to byte in dst
6258     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6259   %}
6260   ins_pipe( pipe_slow );
6261 %}
6262 
6263 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
6264   predicate(Matcher::vector_length(n) == 32 &&
6265             !VectorNode::is_vshift_cnt(n->in(2)) &&
6266             !VM_Version::supports_avx512bw());
6267   match(Set dst ( LShiftVB src shift));
6268   match(Set dst ( RShiftVB src shift));
6269   match(Set dst (URShiftVB src shift));
6270   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
6271   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
6272   ins_encode %{
6273     assert(UseAVX >= 2, "required");
6274 
6275     int opcode = this->ideal_Opcode();
6276     int vlen_enc = Assembler::AVX_128bit;
6277     // Process lower 128 bits and get result in dst
6278     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6279     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6280     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6281     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6282     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6283 
6284     // Process higher 128 bits and get result in vtmp3
6285     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6286     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6287     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
6288     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
6289     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
6290     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6291     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
6292 
6293     // Merge the two results in dst
6294     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6295   %}
6296   ins_pipe( pipe_slow );
6297 %}
6298 
6299 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6300   predicate(Matcher::vector_length(n) <= 32 &&
6301             !VectorNode::is_vshift_cnt(n->in(2)) &&
6302             VM_Version::supports_avx512bw());
6303   match(Set dst ( LShiftVB src shift));
6304   match(Set dst ( RShiftVB src shift));
6305   match(Set dst (URShiftVB src shift));
6306   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6307   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6308   ins_encode %{
6309     assert(UseAVX > 2, "required");
6310 
6311     int opcode = this->ideal_Opcode();
6312     int vlen_enc = vector_length_encoding(this);
6313     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6314   %}
6315   ins_pipe( pipe_slow );
6316 %}
6317 
6318 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6319   predicate(Matcher::vector_length(n) == 64 &&
6320             !VectorNode::is_vshift_cnt(n->in(2)) &&
6321             VM_Version::supports_avx512bw());
6322   match(Set dst ( LShiftVB src shift));
6323   match(Set dst ( RShiftVB src shift));
6324   match(Set dst (URShiftVB src shift));
6325   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6326   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6327   ins_encode %{
6328     assert(UseAVX > 2, "required");
6329 
6330     int opcode = this->ideal_Opcode();
6331     int vlen_enc = Assembler::AVX_256bit;
6332     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6333     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6334     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6335     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6336     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6337   %}
6338   ins_pipe( pipe_slow );
6339 %}
6340 
6341 // Short variable shift
6342 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6343   predicate(Matcher::vector_length(n) <= 8 &&
6344             !VectorNode::is_vshift_cnt(n->in(2)) &&
6345             !VM_Version::supports_avx512bw());
6346   match(Set dst ( LShiftVS src shift));
6347   match(Set dst ( RShiftVS src shift));
6348   match(Set dst (URShiftVS src shift));
6349   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6350   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6351   ins_encode %{
6352     assert(UseAVX >= 2, "required");
6353 
6354     int opcode = this->ideal_Opcode();
6355     bool sign = (opcode != Op_URShiftVS);
6356     int vlen_enc = Assembler::AVX_256bit;
6357     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6358     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6359     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
6360     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6361     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
6362     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6363   %}
6364   ins_pipe( pipe_slow );
6365 %}
6366 
6367 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6368   predicate(Matcher::vector_length(n) == 16 &&
6369             !VectorNode::is_vshift_cnt(n->in(2)) &&
6370             !VM_Version::supports_avx512bw());
6371   match(Set dst ( LShiftVS src shift));
6372   match(Set dst ( RShiftVS src shift));
6373   match(Set dst (URShiftVS src shift));
6374   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6375   format %{ "vector_varshift_short $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6376   ins_encode %{
6377     assert(UseAVX >= 2, "required");
6378 
6379     int opcode = this->ideal_Opcode();
6380     bool sign = (opcode != Op_URShiftVS);
6381     int vlen_enc = Assembler::AVX_256bit;
6382     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
6383     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6384     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6385     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6386     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6387 
6388     // Shift upper half, with result in dst using vtmp1 as TEMP
6389     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
6390     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
6391     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6392     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6393     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6394     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6395 
6396     // Merge lower and upper half result into dst
6397     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6398     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6399   %}
6400   ins_pipe( pipe_slow );
6401 %}
6402 
6403 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
6404   predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6405             VM_Version::supports_avx512bw());
6406   match(Set dst ( LShiftVS src shift));
6407   match(Set dst ( RShiftVS src shift));
6408   match(Set dst (URShiftVS src shift));
6409   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
6410   ins_encode %{
6411     assert(UseAVX > 2, "required");
6412 
6413     int opcode = this->ideal_Opcode();
6414     int vlen_enc = vector_length_encoding(this);
6415     if (!VM_Version::supports_avx512vl()) {
6416       vlen_enc = Assembler::AVX_512bit;
6417     }
6418     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6419   %}
6420   ins_pipe( pipe_slow );
6421 %}
6422 
// Integer variable shift
6424 instruct vshiftI_var(vec dst, vec src, vec shift) %{
6425   predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6426   match(Set dst ( LShiftVI src shift));
6427   match(Set dst ( RShiftVI src shift));
6428   match(Set dst (URShiftVI src shift));
6429   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
6430   ins_encode %{
6431     assert(UseAVX >= 2, "required");
6432 
6433     int opcode = this->ideal_Opcode();
6434     int vlen_enc = vector_length_encoding(this);
6435     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6436   %}
6437   ins_pipe( pipe_slow );
6438 %}
6439 
// Long variable shift
6441 instruct vshiftL_var(vec dst, vec src, vec shift) %{
6442   predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6443   match(Set dst ( LShiftVL src shift));
6444   match(Set dst (URShiftVL src shift));
6445   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6446   ins_encode %{
6447     assert(UseAVX >= 2, "required");
6448 
6449     int opcode = this->ideal_Opcode();
6450     int vlen_enc = vector_length_encoding(this);
6451     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6452   %}
6453   ins_pipe( pipe_slow );
6454 %}
6455 
// Long variable arithmetic right shift
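// AVX2 has no variable arithmetic right shift for 64-bit lanes (vpsravq is AVX-512 only), so the
// macro assembler emulates it with an extra XMM temporary.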
6457 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
6458   predicate(Matcher::vector_length(n) <= 4 &&
6459             !VectorNode::is_vshift_cnt(n->in(2)) &&
6460             UseAVX == 2);
6461   match(Set dst (RShiftVL src shift));
6462   effect(TEMP dst, TEMP vtmp);
  format %{ "vector_varshift_long $dst,$src,$shift\t! using $vtmp as TEMP" %}
6464   ins_encode %{
6465     int opcode = this->ideal_Opcode();
6466     int vlen_enc = vector_length_encoding(this);
6467     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
6468                  $vtmp$$XMMRegister);
6469   %}
6470   ins_pipe( pipe_slow );
6471 %}
6472 
6473 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
6474   predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6475             UseAVX > 2);
6476   match(Set dst (RShiftVL src shift));
  format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6478   ins_encode %{
6479     int opcode = this->ideal_Opcode();
6480     int vlen_enc = vector_length_encoding(this);
6481     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6482   %}
6483   ins_pipe( pipe_slow );
6484 %}
6485 
6486 // --------------------------------- AND --------------------------------------
6487 
6488 instruct vand(vec dst, vec src) %{
6489   predicate(UseAVX == 0);
6490   match(Set dst (AndV dst src));
6491   format %{ "pand    $dst,$src\t! and vectors" %}
6492   ins_encode %{
6493     __ pand($dst$$XMMRegister, $src$$XMMRegister);
6494   %}
6495   ins_pipe( pipe_slow );
6496 %}
6497 
6498 instruct vand_reg(vec dst, vec src1, vec src2) %{
6499   predicate(UseAVX > 0);
6500   match(Set dst (AndV src1 src2));
6501   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
6502   ins_encode %{
6503     int vlen_enc = vector_length_encoding(this);
6504     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6505   %}
6506   ins_pipe( pipe_slow );
6507 %}
6508 
6509 instruct vand_mem(vec dst, vec src, memory mem) %{
6510   predicate((UseAVX > 0) &&
6511             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6512   match(Set dst (AndV src (LoadVector mem)));
6513   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
6514   ins_encode %{
6515     int vlen_enc = vector_length_encoding(this);
6516     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6517   %}
6518   ins_pipe( pipe_slow );
6519 %}
6520 
6521 // --------------------------------- OR ---------------------------------------
6522 
6523 instruct vor(vec dst, vec src) %{
6524   predicate(UseAVX == 0);
6525   match(Set dst (OrV dst src));
6526   format %{ "por     $dst,$src\t! or vectors" %}
6527   ins_encode %{
6528     __ por($dst$$XMMRegister, $src$$XMMRegister);
6529   %}
6530   ins_pipe( pipe_slow );
6531 %}
6532 
6533 instruct vor_reg(vec dst, vec src1, vec src2) %{
6534   predicate(UseAVX > 0);
6535   match(Set dst (OrV src1 src2));
6536   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
6537   ins_encode %{
6538     int vlen_enc = vector_length_encoding(this);
6539     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6540   %}
6541   ins_pipe( pipe_slow );
6542 %}
6543 
6544 instruct vor_mem(vec dst, vec src, memory mem) %{
6545   predicate((UseAVX > 0) &&
6546             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6547   match(Set dst (OrV src (LoadVector mem)));
6548   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
6549   ins_encode %{
6550     int vlen_enc = vector_length_encoding(this);
6551     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6552   %}
6553   ins_pipe( pipe_slow );
6554 %}
6555 
6556 // --------------------------------- XOR --------------------------------------
6557 
6558 instruct vxor(vec dst, vec src) %{
6559   predicate(UseAVX == 0);
6560   match(Set dst (XorV dst src));
6561   format %{ "pxor    $dst,$src\t! xor vectors" %}
6562   ins_encode %{
6563     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
6564   %}
6565   ins_pipe( pipe_slow );
6566 %}
6567 
6568 instruct vxor_reg(vec dst, vec src1, vec src2) %{
6569   predicate(UseAVX > 0);
6570   match(Set dst (XorV src1 src2));
6571   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
6572   ins_encode %{
6573     int vlen_enc = vector_length_encoding(this);
6574     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6575   %}
6576   ins_pipe( pipe_slow );
6577 %}
6578 
6579 instruct vxor_mem(vec dst, vec src, memory mem) %{
6580   predicate((UseAVX > 0) &&
6581             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6582   match(Set dst (XorV src (LoadVector mem)));
6583   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
6584   ins_encode %{
6585     int vlen_enc = vector_length_encoding(this);
6586     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6587   %}
6588   ins_pipe( pipe_slow );
6589 %}
6590 
6591 // --------------------------------- VectorCast --------------------------------------
6592 
6593 instruct vcastBtoX(vec dst, vec src) %{
6594   match(Set dst (VectorCastB2X src));
6595   format %{ "vector_cast_b2x $dst,$src\t!" %}
6596   ins_encode %{
6597     assert(UseAVX > 0, "required");
6598 
6599     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6600     int vlen_enc = vector_length_encoding(this);
6601     switch (to_elem_bt) {
6602       case T_SHORT:
6603         __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6604         break;
6605       case T_INT:
6606         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6607         break;
6608       case T_FLOAT:
6609         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6610         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6611         break;
6612       case T_LONG:
6613         __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6614         break;
6615       case T_DOUBLE:
6616         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6617         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6618         break;
6619 
6620       default: assert(false, "%s", type2name(to_elem_bt));
6621     }
6622   %}
6623   ins_pipe( pipe_slow );
6624 %}
6625 
6626 instruct castStoX(vec dst, vec src, rRegP scratch) %{
6627   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6628             Matcher::vector_length(n->in(1)) <= 8 && // src
6629             Matcher::vector_element_basic_type(n) == T_BYTE);
6630   effect(TEMP scratch);
6631   match(Set dst (VectorCastS2X src));
6632   format %{ "vector_cast_s2x $dst,$src\t! using $scratch as TEMP" %}
6633   ins_encode %{
6634     assert(UseAVX > 0, "required");
6635 
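    // Zero the high byte of every short element, then pack pairs of shorts into bytes; after the
    // masking the unsigned saturation of vpackuswb is exact.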
6636     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, $scratch$$Register);
6637     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6638   %}
6639   ins_pipe( pipe_slow );
6640 %}
6641 
6642 instruct vcastStoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6643   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6644             Matcher::vector_length(n->in(1)) == 16 && // src
6645             Matcher::vector_element_basic_type(n) == T_BYTE);
6646   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6647   match(Set dst (VectorCastS2X src));
6648   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp, $scratch as TEMP" %}
6649   ins_encode %{
6650     assert(UseAVX > 0, "required");
6651 
6652     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
6653     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6654     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
6655     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6656   %}
6657   ins_pipe( pipe_slow );
6658 %}
6659 
6660 instruct vcastStoX_evex(vec dst, vec src) %{
6661   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
6662             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
6663   match(Set dst (VectorCastS2X src));
6664   format %{ "vector_cast_s2x $dst,$src\t!" %}
6665   ins_encode %{
6666     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6667     int src_vlen_enc = vector_length_encoding(this, $src);
6668     int vlen_enc = vector_length_encoding(this);
6669     switch (to_elem_bt) {
6670       case T_BYTE:
6671         if (!VM_Version::supports_avx512vl()) {
6672           vlen_enc = Assembler::AVX_512bit;
6673         }
6674         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6675         break;
6676       case T_INT:
6677         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6678         break;
6679       case T_FLOAT:
6680         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6681         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6682         break;
6683       case T_LONG:
6684         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6685         break;
6686       case T_DOUBLE:
6687         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6688         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6689         break;
6690       default:
6691         ShouldNotReachHere();
6692     }
6693   %}
6694   ins_pipe( pipe_slow );
6695 %}
6696 
6697 instruct castItoX(vec dst, vec src, rRegP scratch) %{
6698   predicate(UseAVX <= 2 &&
6699             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
6700             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
6701   match(Set dst (VectorCastI2X src));
6702   format %{ "vector_cast_i2x $dst,$src\t! using $scratch as TEMP" %}
6703   effect(TEMP scratch);
6704   ins_encode %{
6705     assert(UseAVX > 0, "required");
6706 
6707     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6708     int vlen_enc = vector_length_encoding(this, $src);
6709 
6710     if (to_elem_bt == T_BYTE) {
6711       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
6712       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6713       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6714     } else {
6715       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
6716       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6717       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6718     }
6719   %}
6720   ins_pipe( pipe_slow );
6721 %}
6722 
6723 instruct vcastItoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6724   predicate(UseAVX <= 2 &&
6725             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
6726             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
6727   match(Set dst (VectorCastI2X src));
6728   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp and $scratch as TEMP" %}
6729   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6730   ins_encode %{
6731     assert(UseAVX > 0, "required");
6732 
6733     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6734     int vlen_enc = vector_length_encoding(this, $src);
6735 
6736     if (to_elem_bt == T_BYTE) {
6737       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
6738       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
6739       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6740       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6741     } else {
6742       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
6743       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6744       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
6745       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6746     }
6747   %}
6748   ins_pipe( pipe_slow );
6749 %}
6750 
6751 instruct vcastItoX_evex(vec dst, vec src) %{
6752   predicate(UseAVX > 2 ||
6753             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
6754   match(Set dst (VectorCastI2X src));
6755   format %{ "vector_cast_i2x $dst,$src\t!" %}
6756   ins_encode %{
6757     assert(UseAVX > 0, "required");
6758 
6759     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
6760     int src_vlen_enc = vector_length_encoding(this, $src);
6761     int dst_vlen_enc = vector_length_encoding(this);
6762     switch (dst_elem_bt) {
6763       case T_BYTE:
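        // Without AVX512VL, EVEX down-converts (evpmovdb/evpmovdw) are only available in the 512-bit form.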
6764         if (!VM_Version::supports_avx512vl()) {
6765           src_vlen_enc = Assembler::AVX_512bit;
6766         }
6767         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6768         break;
6769       case T_SHORT:
6770         if (!VM_Version::supports_avx512vl()) {
6771           src_vlen_enc = Assembler::AVX_512bit;
6772         }
6773         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6774         break;
6775       case T_FLOAT:
6776         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6777         break;
6778       case T_LONG:
6779         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6780         break;
6781       case T_DOUBLE:
6782         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6783         break;
6784       default:
6785         ShouldNotReachHere();
6786     }
6787   %}
6788   ins_pipe( pipe_slow );
6789 %}
6790 
6791 instruct vcastLtoBS(vec dst, vec src, rRegP scratch) %{
6792   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
6793             UseAVX <= 2);
6794   match(Set dst (VectorCastL2X src));
6795   effect(TEMP scratch);
6796   format %{ "vector_cast_l2x  $dst,$src\t! using $scratch as TEMP" %}
6797   ins_encode %{
6798     assert(UseAVX > 0, "required");
6799 
6800     int vlen = Matcher::vector_length_in_bytes(this, $src);
6801     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
6802     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
6803                                                       : ExternalAddress(vector_int_to_short_mask());
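    // Selector 8 (0b00001000) gathers the low dword of each qword; for 256-bit sources the extra
    // vpermpd moves both lanes' packed ints into the low 128 bits before masking and packing down.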
6804     if (vlen <= 16) {
6805       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
6806       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
6807       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6808     } else {
6809       assert(vlen <= 32, "required");
6810       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
6811       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
6812       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
6813       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6814     }
6815     if (to_elem_bt == T_BYTE) {
6816       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6817     }
6818   %}
6819   ins_pipe( pipe_slow );
6820 %}
6821 
6822 instruct vcastLtoX_evex(vec dst, vec src) %{
6823   predicate(UseAVX > 2 ||
6824             (Matcher::vector_element_basic_type(n) == T_INT ||
6825              Matcher::vector_element_basic_type(n) == T_FLOAT ||
6826              Matcher::vector_element_basic_type(n) == T_DOUBLE));
6827   match(Set dst (VectorCastL2X src));
6828   format %{ "vector_cast_l2x  $dst,$src\t!" %}
6829   ins_encode %{
6830     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6831     int vlen = Matcher::vector_length_in_bytes(this, $src);
6832     int vlen_enc = vector_length_encoding(this, $src);
6833     switch (to_elem_bt) {
6834       case T_BYTE:
6835         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
6836           vlen_enc = Assembler::AVX_512bit;
6837         }
6838         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6839         break;
6840       case T_SHORT:
6841         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
6842           vlen_enc = Assembler::AVX_512bit;
6843         }
6844         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6845         break;
6846       case T_INT:
6847         if (vlen == 8) {
6848           if ($dst$$XMMRegister != $src$$XMMRegister) {
6849             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6850           }
6851         } else if (vlen == 16) {
6852           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
6853         } else if (vlen == 32) {
6854           if (UseAVX > 2) {
6855             if (!VM_Version::supports_avx512vl()) {
6856               vlen_enc = Assembler::AVX_512bit;
6857             }
6858             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6859           } else {
6860             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
6861             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
6862           }
6863         } else { // vlen == 64
6864           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6865         }
6866         break;
6867       case T_FLOAT:
6868         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
6869         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6870         break;
6871       case T_DOUBLE:
6872         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
6873         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6874         break;
6875 
6876       default: assert(false, "%s", type2name(to_elem_bt));
6877     }
6878   %}
6879   ins_pipe( pipe_slow );
6880 %}
6881 
6882 instruct vcastFtoD_reg(vec dst, vec src) %{
6883   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
6884   match(Set dst (VectorCastF2X src));
6885   format %{ "vector_cast_f2x  $dst,$src\t!" %}
6886   ins_encode %{
6887     int vlen_enc = vector_length_encoding(this);
6888     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6889   %}
6890   ins_pipe( pipe_slow );
6891 %}
6892 
6893 instruct vcastDtoF_reg(vec dst, vec src) %{
6894   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
6895   match(Set dst (VectorCastD2X src));
6896   format %{ "vector_cast_d2x  $dst,$src\t!" %}
6897   ins_encode %{
6898     int vlen_enc = vector_length_encoding(this, $src);
6899     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6900   %}
6901   ins_pipe( pipe_slow );
6902 %}
6903 
6904 // --------------------------------- VectorMaskCmp --------------------------------------
6905 
6906 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
6907   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
6908             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
6909             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6910   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6911   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
6912   ins_encode %{
6913     int vlen_enc = vector_length_encoding(this, $src1);
6914     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6915     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
6916       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6917     } else {
6918       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6919     }
6920   %}
6921   ins_pipe( pipe_slow );
6922 %}
6923 
6924 instruct evcmpFD(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
6925   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
6926             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6927   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6928   effect(TEMP scratch, TEMP ktmp);
6929   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6930   ins_encode %{
6931     int vlen_enc = Assembler::AVX_512bit;
6932     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6933     KRegister mask = k0; // The comparison itself is not being masked.
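    // Materialize the k-register compare result as a -1/0 vector: a masked load of all-ones sets the
    // true lanes and zeroes the rest (merge is false).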
6934     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
6935       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6936       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
6937     } else {
6938       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6939       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
6940     }
6941   %}
6942   ins_pipe( pipe_slow );
6943 %}
6944 
6945 instruct vcmp(legVec dst, legVec src1, legVec src2, immI8 cond, rRegP scratch) %{
6946   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vl()) &&
6947             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
6948             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
6949             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
6950             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
6951   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6952   effect(TEMP scratch);
6953   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6954   ins_encode %{
6955     int vlen_enc = vector_length_encoding(this, $src1);
6956     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6957     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
6958     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, ww, vlen_enc, $scratch$$Register);
6959   %}
6960   ins_pipe( pipe_slow );
6961 %}
6962 
6963 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
6964   predicate((UseAVX == 2 || !VM_Version::supports_avx512vl()) &&
6965             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
6966             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
6967             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 16 && // src1
6968             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
6969   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6970   effect(TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6971   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6972   ins_encode %{
6973     int vlen = Matcher::vector_length_in_bytes(this, $src1);
6974     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6975     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
6976     __ vpcmpu(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
6977               $vtmp2$$XMMRegister, $scratch$$Register);
6978   %}
6979   ins_pipe( pipe_slow );
6980 %}
6981 
6982 instruct vcmpu32(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, legVec vtmp3, rRegP scratch) %{
6983   predicate((UseAVX == 2 || !VM_Version::supports_avx512vl()) &&
6984             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
6985             Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 32 && // src1
6986             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
6987   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6988   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP scratch);
6989   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6990   ins_encode %{
6991     int vlen = Matcher::vector_length_in_bytes(this, $src1);
6992     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6993     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
6994     __ vpcmpu32(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
6995                 $vtmp2$$XMMRegister, $vtmp3$$XMMRegister, $scratch$$Register);
6996   %}
6997   ins_pipe( pipe_slow );
6998 %}
6999 
7000 instruct evcmp(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
7001   predicate(UseAVX > 2 &&
7002             (VM_Version::supports_avx512vl() ||
7003              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
7004              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7005   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7006   effect(TEMP scratch, TEMP ktmp);
7007   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7008   ins_encode %{
7009     assert(UseAVX > 2, "required");
7010 
7011     int vlen_enc = vector_length_encoding(this, $src1);
7012     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7013     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
7014     KRegister mask = k0; // The comparison itself is not being masked.
7015     bool merge = false;
7016     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7017 
7018     switch (src1_elem_bt) {
7019       case T_BYTE: {
7020         __ evpcmpb($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7021         __ evmovdqub($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7022         break;
7023       }
7024       case T_SHORT: {
7025         __ evpcmpw($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7026         __ evmovdquw($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7027         break;
7028       }
7029       case T_INT: {
7030         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7031         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7032         break;
7033       }
7034       case T_LONG: {
7035         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7036         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7037         break;
7038       }
7039       default: assert(false, "%s", type2name(src1_elem_bt));
7040     }
7041   %}
7042   ins_pipe( pipe_slow );
7043 %}
7044 
7045 // Extract
7046 
7047 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
7048   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
7049   match(Set dst (ExtractI src idx));
7050   match(Set dst (ExtractS src idx));
7051 #ifdef _LP64
7052   match(Set dst (ExtractB src idx));
7053 #endif
7054   format %{ "extractI $dst,$src,$idx\t!" %}
7055   ins_encode %{
7056     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7057 
7058     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7059     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7060   %}
7061   ins_pipe( pipe_slow );
7062 %}
7063 
7064 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
7065   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
7066             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
7067   match(Set dst (ExtractI src idx));
7068   match(Set dst (ExtractS src idx));
7069 #ifdef _LP64
7070   match(Set dst (ExtractB src idx));
7071 #endif
7072   effect(TEMP vtmp);
7073   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
7074   ins_encode %{
7075     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7076 
7077     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7078     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7079     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
7080   %}
7081   ins_pipe( pipe_slow );
7082 %}
7083 
7084 #ifdef _LP64
7085 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
7086   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
7087   match(Set dst (ExtractL src idx));
7088   format %{ "extractL $dst,$src,$idx\t!" %}
7089   ins_encode %{
7090     assert(UseSSE >= 4, "required");
7091     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7092 
7093     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7094   %}
7095   ins_pipe( pipe_slow );
7096 %}
7097 
7098 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
7099   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7100             Matcher::vector_length(n->in(1)) == 8);  // src
7101   match(Set dst (ExtractL src idx));
7102   effect(TEMP vtmp);
7103   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
7104   ins_encode %{
7105     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7106 
7107     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7108     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
7109   %}
7110   ins_pipe( pipe_slow );
7111 %}
7112 #endif
7113 
7114 instruct extractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7115   predicate(Matcher::vector_length(n->in(1)) <= 4);
7116   match(Set dst (ExtractF src idx));
7117   effect(TEMP dst, TEMP tmp, TEMP vtmp);
7118   format %{ "extractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7119   ins_encode %{
7120     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7121 
7122     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $tmp$$Register, $vtmp$$XMMRegister);
7123   %}
7124   ins_pipe( pipe_slow );
7125 %}
7126 
7127 instruct vextractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7128   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
7129             Matcher::vector_length(n->in(1)/*src*/) == 16);
7130   match(Set dst (ExtractF src idx));
7131   effect(TEMP tmp, TEMP vtmp);
7132   format %{ "vextractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7133   ins_encode %{
7134     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7135 
7136     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7137     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant, $tmp$$Register);
7138   %}
7139   ins_pipe( pipe_slow );
7140 %}
7141 
7142 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
7143   predicate(Matcher::vector_length(n->in(1)) == 2); // src
7144   match(Set dst (ExtractD src idx));
7145   format %{ "extractD $dst,$src,$idx\t!" %}
7146   ins_encode %{
7147     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7148 
7149     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7150   %}
7151   ins_pipe( pipe_slow );
7152 %}
7153 
7154 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
7155   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7156             Matcher::vector_length(n->in(1)) == 8);  // src
7157   match(Set dst (ExtractD src idx));
7158   effect(TEMP vtmp);
7159   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
7160   ins_encode %{
7161     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7162 
7163     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7164     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
7165   %}
7166   ins_pipe( pipe_slow );
7167 %}
7168 
7169 // --------------------------------- Vector Blend --------------------------------------
7170 
7171 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
7172   predicate(UseAVX == 0);
7173   match(Set dst (VectorBlend (Binary dst src) mask));
7174   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
7175   effect(TEMP tmp);
7176   ins_encode %{
7177     assert(UseSSE >= 4, "required");
7178 
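    // SSE4.1 pblendvb takes its selector implicitly in xmm0; $tmp is constrained to xmm0, so copy the mask there first.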
7179     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
7180       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
7181     }
7182     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
7183   %}
7184   ins_pipe( pipe_slow );
7185 %}
7186 
7187 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
7188   predicate(UseAVX > 0 &&
7189             Matcher::vector_length_in_bytes(n) <= 32 &&
7190             is_integral_type(Matcher::vector_element_basic_type(n)));
7191   match(Set dst (VectorBlend (Binary src1 src2) mask));
7192   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7193   ins_encode %{
7194     int vlen_enc = vector_length_encoding(this);
7195     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7196   %}
7197   ins_pipe( pipe_slow );
7198 %}
7199 
7200 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
7201   predicate(UseAVX > 0 &&
7202             Matcher::vector_length_in_bytes(n) <= 32 &&
7203             !is_integral_type(Matcher::vector_element_basic_type(n)));
7204   match(Set dst (VectorBlend (Binary src1 src2) mask));
7205   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7206   ins_encode %{
7207     int vlen_enc = vector_length_encoding(this);
7208     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7209   %}
7210   ins_pipe( pipe_slow );
7211 %}
7212 
7213 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch, kReg ktmp) %{
7214   predicate(Matcher::vector_length_in_bytes(n) == 64);
7215   match(Set dst (VectorBlend (Binary src1 src2) mask));
  format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch and $ktmp as TEMP" %}
7217   effect(TEMP scratch, TEMP ktmp);
7218   ins_encode %{
    int vlen_enc = Assembler::AVX_512bit;
    BasicType elem_bt = Matcher::vector_element_basic_type(this);
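    // Compare the boolean vector against all-ones to build a k-register predicate, then blend $src1/$src2 under it.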
7221     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register);
7222     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7223   %}
7224   ins_pipe( pipe_slow );
7225 %}
7226 
7227 // --------------------------------- ABS --------------------------------------
7228 // a = |a|
7229 instruct vabsB_reg(vec dst, vec src) %{
7230   match(Set dst (AbsVB  src));
7231   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
7232   ins_encode %{
7233     uint vlen = Matcher::vector_length(this);
7234     if (vlen <= 16) {
7235       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7236     } else {
7237       int vlen_enc = vector_length_encoding(this);
7238       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7239     }
7240   %}
7241   ins_pipe( pipe_slow );
7242 %}
7243 
7244 instruct vabsS_reg(vec dst, vec src) %{
7245   match(Set dst (AbsVS  src));
7246   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
7247   ins_encode %{
7248     uint vlen = Matcher::vector_length(this);
7249     if (vlen <= 8) {
7250       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7251     } else {
7252       int vlen_enc = vector_length_encoding(this);
7253       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7254     }
7255   %}
7256   ins_pipe( pipe_slow );
7257 %}
7258 
7259 instruct vabsI_reg(vec dst, vec src) %{
7260   match(Set dst (AbsVI  src));
7261   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
7262   ins_encode %{
7263     uint vlen = Matcher::vector_length(this);
7264     if (vlen <= 4) {
7265       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7266     } else {
7267       int vlen_enc = vector_length_encoding(this);
7268       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7269     }
7270   %}
7271   ins_pipe( pipe_slow );
7272 %}
7273 
7274 instruct vabsL_reg(vec dst, vec src) %{
7275   match(Set dst (AbsVL  src));
7276   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
7277   ins_encode %{
7278     assert(UseAVX > 2, "required");
7279     int vlen_enc = vector_length_encoding(this);
7280     if (!VM_Version::supports_avx512vl()) {
7281       vlen_enc = Assembler::AVX_512bit;
7282     }
7283     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7284   %}
7285   ins_pipe( pipe_slow );
7286 %}
7287 
7288 // --------------------------------- ABSNEG --------------------------------------
7289 
7290 instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
7291   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
7292   match(Set dst (AbsVF src));
7293   match(Set dst (NegVF src));
7294   effect(TEMP scratch);
7295   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
7296   ins_cost(150);
7297   ins_encode %{
7298     int opcode = this->ideal_Opcode();
7299     int vlen = Matcher::vector_length(this);
7300     if (vlen == 2) {
7301       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7302     } else {
7303       assert(vlen == 8 || vlen == 16, "required");
7304       int vlen_enc = vector_length_encoding(this);
7305       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7306     }
7307   %}
7308   ins_pipe( pipe_slow );
7309 %}
7310 
7311 instruct vabsneg4F(vec dst, rRegI scratch) %{
7312   predicate(Matcher::vector_length(n) == 4);
7313   match(Set dst (AbsVF dst));
7314   match(Set dst (NegVF dst));
7315   effect(TEMP scratch);
7316   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
7317   ins_cost(150);
7318   ins_encode %{
7319     int opcode = this->ideal_Opcode();
7320     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
7321   %}
7322   ins_pipe( pipe_slow );
7323 %}
7324 
7325 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
7326   match(Set dst (AbsVD  src));
7327   match(Set dst (NegVD  src));
7328   effect(TEMP scratch);
7329   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
7330   ins_encode %{
7331     int opcode = this->ideal_Opcode();
7332     uint vlen = Matcher::vector_length(this);
7333     if (vlen == 2) {
7334       assert(UseSSE >= 2, "required");
7335       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7336     } else {
7337       int vlen_enc = vector_length_encoding(this);
7338       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7339     }
7340   %}
7341   ins_pipe( pipe_slow );
7342 %}
7343 
7344 //------------------------------------- VectorTest --------------------------------------------
7345 
7346 #ifdef _LP64
7347 instruct vptest_alltrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp1, legVec vtmp2, rFlagsReg cr) %{
7348   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7349             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7350             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7351   match(Set dst (VectorTest src1 src2 ));
7352   effect(TEMP vtmp1, TEMP vtmp2, KILL cr);
7353   format %{ "vector_test $dst,$src1, $src2\t! using $vtmp1, $vtmp2 and $cr as TEMP" %}
7354   ins_encode %{
7355     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7356     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
7357     __ setb(Assembler::carrySet, $dst$$Register);
7358     __ movzbl($dst$$Register, $dst$$Register);
7359   %}
7360   ins_pipe( pipe_slow );
7361 %}
7362 
7363 instruct vptest_alltrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7364   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7365             Matcher::vector_length_in_bytes(n->in(1)) <  64 &&
7366             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7367   match(Set dst (VectorTest src1 src2 ));
7368   effect(KILL cr);
7369   format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %}
7370   ins_encode %{
7371     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7372     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
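    // For the all-true test the second operand is an all-ones vector; (v)ptest sets CF iff
    // (~src1 & src2) == 0, so a set carry flag means every lane of src1 is true.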
7373     __ setb(Assembler::carrySet, $dst$$Register);
7374     __ movzbl($dst$$Register, $dst$$Register);
7375   %}
7376   ins_pipe( pipe_slow );
7377 %}
7378 
7379 instruct vptest_alltrue_evex(rRegI dst, legVec src1, legVec src2, kReg ktmp, rFlagsReg cr) %{
7380   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 64 &&
7381             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7382   match(Set dst (VectorTest src1 src2 ));
7383   effect(KILL cr, TEMP ktmp);
  format %{ "vector_test $dst,$src1, $src2\t! using $cr and $ktmp as TEMP" %}
7385   ins_encode %{
7386     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7387     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister);
7388     __ setb(Assembler::carrySet, $dst$$Register);
7389     __ movzbl($dst$$Register, $dst$$Register);
7390   %}
7391   ins_pipe( pipe_slow );
7392 %}
7393 
7394 instruct vptest_anytrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp, rFlagsReg cr) %{
7395   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7396             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7397             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7398   match(Set dst (VectorTest src1 src2 ));
7399   effect(TEMP vtmp, KILL cr);
7400   format %{ "vector_test_any_true $dst,$src1,$src2\t! using $vtmp, $cr as TEMP" %}
7401   ins_encode %{
7402     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7403     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7404     __ setb(Assembler::notZero, $dst$$Register);
7405     __ movzbl($dst$$Register, $dst$$Register);
7406   %}
7407   ins_pipe( pipe_slow );
7408 %}
7409 
7410 instruct vptest_anytrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7411   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7412             Matcher::vector_length_in_bytes(n->in(1)) < 64  &&
7413             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7414   match(Set dst (VectorTest src1 src2 ));
7415   effect(KILL cr);
7416   format %{ "vector_test_any_true $dst,$src1,$src2\t! using $cr as TEMP" %}
7417   ins_encode %{
7418     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7419     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
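    // (v)ptest sets ZF iff (src1 & src2) == 0, so notZero captures "at least one lane is true".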
7420     __ setb(Assembler::notZero, $dst$$Register);
7421     __ movzbl($dst$$Register, $dst$$Register);
7422   %}
7423   ins_pipe( pipe_slow );
7424 %}
7425 
7426 instruct vptest_anytrue_evex(rRegI dst, legVec src1, legVec src2, kReg ktmp, rFlagsReg cr) %{
7427   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 64 &&
7428             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7429   match(Set dst (VectorTest src1 src2 ));
7430   effect(KILL cr, TEMP ktmp);
  format %{ "vector_test_any_true $dst,$src1,$src2\t! using $cr and $ktmp as TEMP" %}
7432   ins_encode %{
7433     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7434     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister);
7435     __ setb(Assembler::notZero, $dst$$Register);
7436     __ movzbl($dst$$Register, $dst$$Register);
7437   %}
7438   ins_pipe( pipe_slow );
7439 %}
7440 
7441 instruct cmpvptest_anytrue_lt16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, legVec vtmp) %{
7442   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 4 &&
7443             Matcher::vector_length_in_bytes(n->in(1)->in(1)) < 16 &&
7444             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7445   match(Set cr (CmpI (VectorTest src1 src2) zero));
7446   effect(TEMP vtmp);
7447   format %{ "cmp_vector_test_any_true $src1,$src2\t! using $vtmp as TEMP" %}
7448   ins_encode %{
7449     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7450     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7451   %}
7452   ins_pipe( pipe_slow );
7453 %}
7454 
7455 instruct cmpvptest_anytrue(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero) %{
7456   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 16 &&
7457             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <  64 &&
7458             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7459   match(Set cr (CmpI (VectorTest src1 src2) zero));
7460   format %{ "cmp_vector_test_any_true $src1,$src2\t!" %}
7461   ins_encode %{
7462     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7463     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7464   %}
7465   ins_pipe( pipe_slow );
7466 %}
7467 
7468 instruct cmpvptest_anytrue_evex(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, kReg ktmp) %{
7469   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 &&
7470             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7471   match(Set cr (CmpI (VectorTest src1 src2) zero));
7472   effect(TEMP ktmp);
  format %{ "cmp_vector_test_any_true $src1,$src2\t! using $ktmp as TEMP" %}
7474   ins_encode %{
7475     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7476     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister);
7477   %}
7478   ins_pipe( pipe_slow );
7479 %}
7480 #endif
7481 
7482 //------------------------------------- LoadMask --------------------------------------------
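// VectorLoadMask expands a vector of 0/1 bytes into a full-width 0/-1 lane mask of the node's element type.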
7483 
7484 instruct loadMask(legVec dst, legVec src) %{
7485   predicate(!VM_Version::supports_avx512vlbw());
7486   match(Set dst (VectorLoadMask src));
7487   effect(TEMP dst);
7488   format %{ "vector_loadmask_byte $dst,$src\n\t" %}
7489   ins_encode %{
7490     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7491     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7492 
7493     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
7494   %}
7495   ins_pipe( pipe_slow );
7496 %}
7497 
7498 instruct loadMask_evex(vec dst, vec src) %{
7499   predicate(VM_Version::supports_avx512vlbw());
7500   match(Set dst (VectorLoadMask src));
7501   effect(TEMP dst);
7502   format %{ "vector_loadmask_byte $dst,$src\n\t" %}
7503   ins_encode %{
7504     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7505     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7506 
7507     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, false);
7508   %}
7509   ins_pipe( pipe_slow );
7510 %}
7511 
7512 //------------------------------------- StoreMask --------------------------------------------
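// VectorStoreMask performs the inverse conversion: a 0/-1 lane mask is narrowed to one byte per lane
// holding 0/1 (the trailing (v)pabsb maps -1 to 1).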
7513 
7514 instruct storeMask1B(vec dst, vec src, immI_1 size) %{
7515   predicate(Matcher::vector_length(n) < 64 || VM_Version::supports_avx512vlbw());
7516   match(Set dst (VectorStoreMask src size));
7517   format %{ "vector_store_mask $dst,$src\t!" %}
7518   ins_encode %{
7519     assert(UseSSE >= 3, "required");
7520     if (Matcher::vector_length_in_bytes(this) <= 16) {
7521       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7522     } else {
7523       assert(UseAVX >= 2, "required");
7524       int src_vlen_enc = vector_length_encoding(this, $src);
7525       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7526     }
7527   %}
7528   ins_pipe( pipe_slow );
7529 %}
7530 
7531 instruct storeMask2B(vec dst, vec src, immI_2 size) %{
7532   predicate(Matcher::vector_length(n) <= 8);
7533   match(Set dst (VectorStoreMask src size));
7534   format %{ "vector_store_mask $dst,$src\n\t" %}
7535   ins_encode %{
7536     assert(UseSSE >= 3, "required");
7537     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7538     __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7539   %}
7540   ins_pipe( pipe_slow );
7541 %}
7542 
7543 instruct vstoreMask2B(vec dst, vec src, immI_2 size) %{
7544   predicate(Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
7545   match(Set dst (VectorStoreMask src size));
7546   effect(TEMP dst);
7547   format %{ "vector_store_mask $dst,$src\t!" %}
7548   ins_encode %{
7549     int vlen_enc = Assembler::AVX_128bit;
7550     __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
    __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7552     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7553   %}
7554   ins_pipe( pipe_slow );
7555 %}
7556 
7557 instruct vstoreMask2B_evex(vec dst, vec src, immI_2 size) %{
7558   predicate(VM_Version::supports_avx512bw());
7559   match(Set dst (VectorStoreMask src size));
7560   format %{ "vector_store_mask $dst,$src\t!" %}
7561   ins_encode %{
7562     int src_vlen_enc = vector_length_encoding(this, $src);
7563     int dst_vlen_enc = vector_length_encoding(this);
7564     __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7565     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7566   %}
7567   ins_pipe( pipe_slow );
7568 %}
7569 
7570 instruct storeMask4B(vec dst, vec src, immI_4 size) %{
7571   predicate(Matcher::vector_length(n) <= 4 && UseAVX <= 2);
7572   match(Set dst (VectorStoreMask src size));
7573   format %{ "vector_store_mask $dst,$src\t!" %}
7574   ins_encode %{
7575     assert(UseSSE >= 3, "required");
7576     __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7577     __ packssdw($dst$$XMMRegister, $dst$$XMMRegister);
7578     __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7579   %}
7580   ins_pipe( pipe_slow );
7581 %}
7582 
7583 instruct vstoreMask4B(vec dst, vec src, immI_4 size) %{
7584   predicate(Matcher::vector_length(n) == 8 && UseAVX <= 2);
7585   match(Set dst (VectorStoreMask src size));
7586   format %{ "vector_store_mask $dst,$src\t!" %}
7587   effect(TEMP dst);
7588   ins_encode %{
7589     int vlen_enc = Assembler::AVX_128bit;
7590     __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
7591     __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7592     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7593     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7594   %}
7595   ins_pipe( pipe_slow );
7596 %}
7597 
7598 instruct vstoreMask4B_evex(vec dst, vec src, immI_4 size) %{
7599   predicate(UseAVX > 2);
7600   match(Set dst (VectorStoreMask src size));
7601   format %{ "vector_store_mask $dst,$src\t!" %}
7602   ins_encode %{
7603     int src_vlen_enc = vector_length_encoding(this, $src);
7604     int dst_vlen_enc = vector_length_encoding(this);
7605     if (!VM_Version::supports_avx512vl()) {
7606       src_vlen_enc = Assembler::AVX_512bit;
7607     }
7608     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7609     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7610   %}
7611   ins_pipe( pipe_slow );
7612 %}
7613 
7614 instruct storeMask8B(vec dst, vec src, immI_8 size) %{
7615   predicate(Matcher::vector_length(n) == 2 && UseAVX <= 2);
7616   match(Set dst (VectorStoreMask src size));
7617   format %{ "vector_store_mask $dst,$src\t!" %}
7618   ins_encode %{
7619     assert(UseSSE >= 3, "required");
7620     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
7621     __ packssdw($dst$$XMMRegister, $dst$$XMMRegister);
7622     __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7623     __ pabsb($dst$$XMMRegister, $dst$$XMMRegister);
7624   %}
7625   ins_pipe( pipe_slow );
7626 %}
7627 
7628 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, legVec vtmp) %{
7629   predicate(Matcher::vector_length(n) == 4 && UseAVX <= 2);
7630   match(Set dst (VectorStoreMask src size));
7631   format %{ "vector_store_mask $dst,$src\t! using $vtmp as TEMP" %}
7632   effect(TEMP dst, TEMP vtmp);
7633   ins_encode %{
7634     int vlen_enc = Assembler::AVX_128bit;
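    // 0x88 gathers the low dword of each qword within each 128-bit lane; the extract/blend merges the
    // upper lane's dwords into the low half before narrowing dword -> word -> byte.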
7635     __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
7636     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
7637     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
7638     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7639     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7640     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7641   %}
7642   ins_pipe( pipe_slow );
7643 %}
7644 
7645 instruct vstoreMask8B_evex(vec dst, vec src, immI_8 size) %{
7646   predicate(UseAVX > 2);
7647   match(Set dst (VectorStoreMask src size));
7648   format %{ "vector_store_mask $dst,$src\t!" %}
7649   ins_encode %{
7650     int src_vlen_enc = vector_length_encoding(this, $src);
7651     int dst_vlen_enc = vector_length_encoding(this);
7652     if (!VM_Version::supports_avx512vl()) {
7653       src_vlen_enc = Assembler::AVX_512bit;
7654     }
7655     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7656     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7657   %}
7658   ins_pipe( pipe_slow );
7659 %}
7660 
7661 instruct vmaskcast(vec dst) %{
7662   predicate((Matcher::vector_length(n) == Matcher::vector_length(n->in(1))) &&
7663             (Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))));
7664   match(Set dst (VectorMaskCast dst));
7665   ins_cost(0);
7666   format %{ "vector_mask_cast $dst" %}
7667   ins_encode %{
    // Same lane count and payload size: pure reinterpretation of the mask, no code is emitted.
7669   %}
7670   ins_pipe(empty);
7671 %}
7672 
7673 //-------------------------------- Load Iota Indices ----------------------------------
7674 
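// Loads the constant sequence 0, 1, 2, ... (byte elements) from a
// stub-generated constant table; the scratch register is used to materialize
// the table address. Only the T_BYTE form is matched here.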
7675 instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
7676   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
7677   match(Set dst (VectorLoadConst src));
7678   effect(TEMP scratch);
7679   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
7680   ins_encode %{
7681      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7682      __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen_in_bytes);
7683   %}
7684   ins_pipe( pipe_slow );
7685 %}
7686 
7687 //-------------------------------- Rearrange ----------------------------------
7688 
7689 // LoadShuffle/Rearrange for Byte
7690 
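// For byte vectors the shuffle indices are already byte indices, so
// VectorLoadShuffle needs no code. pshufb/vpshufb only permute within a
// 128-bit lane, so the 32-byte AVX2 case below combines an in-lane shuffle of
// the original src with one of the lane-swapped src and blends the two; with
// AVX512_VBMI, vpermb performs a full cross-lane permute directly.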
7691 instruct loadShuffleB(vec dst) %{
7692   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
7693   match(Set dst (VectorLoadShuffle dst));
7694   format %{ "vector_load_shuffle $dst, $dst" %}
7695   ins_encode %{
7696     // empty
7697   %}
7698   ins_pipe( pipe_slow );
7699 %}
7700 
7701 instruct rearrangeB(vec dst, vec shuffle) %{
7702   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
7703             Matcher::vector_length(n) < 32);
7704   match(Set dst (VectorRearrange dst shuffle));
7705   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
7706   ins_encode %{
7707     assert(UseSSE >= 4, "required");
7708     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
7709   %}
7710   ins_pipe( pipe_slow );
7711 %}
7712 
7713 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
7714   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
7715             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
7716   match(Set dst (VectorRearrange src shuffle));
7717   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
7718   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
7719   ins_encode %{
7720     assert(UseAVX >= 2, "required");
    // Swap the two 128-bit lanes of src into vtmp1
    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to pick up entries that live in the other 128-bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to pick up entries from its own 128-bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the sign bit of shuffle entries that index into the other 128-bit lane
7728     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
7729     // Perform the blend
7730     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
7731   %}
7732   ins_pipe( pipe_slow );
7733 %}
7734 
7735 instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
7736   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
7737             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
7738   match(Set dst (VectorRearrange src shuffle));
7739   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7740   ins_encode %{
7741     int vlen_enc = vector_length_encoding(this);
7742     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7743   %}
7744   ins_pipe( pipe_slow );
7745 %}
7746 
7747 // LoadShuffle/Rearrange for Short
7748 
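// Below AVX512BW there is no variable 16-bit element shuffle, so the short
// shuffle is rewritten into a byte shuffle for pshufb/vpshufb. With AVX512BW,
// LoadShuffle only needs to zero-extend the byte indices to words and the
// rearrange maps to vpermw.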
7749 instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
7750   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7751             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
7752   match(Set dst (VectorLoadShuffle src));
7753   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7754   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
7755   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask, since only a
    // byte shuffle instruction is available on these platforms.
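    // Each short index s expands to the byte indices 2*s and 2*s+1. For
    // example (illustrative only), the short shuffle {2, 0, 3, 1} becomes the
    // byte shuffle {4,5, 0,1, 6,7, 2,3}.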
7758     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7759     if (UseAVX == 0) {
7760       assert(vlen_in_bytes <= 16, "required");
7761       // Multiply each shuffle by two to get byte index
7762       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
7763       __ psllw($vtmp$$XMMRegister, 1);
7764 
7765       // Duplicate to create 2 copies of byte index
7766       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
7767       __ psllw($dst$$XMMRegister, 8);
7768       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
7769 
7770       // Add one to get alternate byte index
7771       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
7772       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
7773     } else {
7774       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
7775       int vlen_enc = vector_length_encoding(this);
7776       // Multiply each shuffle by two to get byte index
7777       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
7778       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
7779 
7780       // Duplicate to create 2 copies of byte index
      __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister, 8, vlen_enc);
7782       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
7783 
7784       // Add one to get alternate byte index
7785       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, $scratch$$Register);
7786     }
7787   %}
7788   ins_pipe( pipe_slow );
7789 %}
7790 
7791 instruct rearrangeS(vec dst, vec shuffle) %{
7792   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7793             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
7794   match(Set dst (VectorRearrange dst shuffle));
7795   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
7796   ins_encode %{
7797     assert(UseSSE >= 4, "required");
7798     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
7799   %}
7800   ins_pipe( pipe_slow );
7801 %}
7802 
7803 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
7804   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7805             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
7806   match(Set dst (VectorRearrange src shuffle));
7807   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
7808   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
7809   ins_encode %{
7810     assert(UseAVX >= 2, "required");
    // Swap the two 128-bit lanes of src into vtmp1
    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to pick up entries that live in the other 128-bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to pick up entries from its own 128-bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the sign bit of shuffle entries that index into the other 128-bit lane
7818     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
7819     // Perform the blend
7820     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
7821   %}
7822   ins_pipe( pipe_slow );
7823 %}
7824 
7825 instruct loadShuffleS_evex(vec dst, vec src) %{
7826   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7827             VM_Version::supports_avx512bw());
7828   match(Set dst (VectorLoadShuffle src));
7829   format %{ "vector_load_shuffle $dst, $src" %}
7830   ins_encode %{
7831     int vlen_enc = vector_length_encoding(this);
7832     if (!VM_Version::supports_avx512vl()) {
7833       vlen_enc = Assembler::AVX_512bit;
7834     }
7835     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7836   %}
7837   ins_pipe( pipe_slow );
7838 %}
7839 
7840 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
7841   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7842             VM_Version::supports_avx512bw());
7843   match(Set dst (VectorRearrange src shuffle));
7844   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7845   ins_encode %{
7846     int vlen_enc = vector_length_encoding(this);
7847     if (!VM_Version::supports_avx512vl()) {
7848       vlen_enc = Assembler::AVX_512bit;
7849     }
7850     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7851   %}
7852   ins_pipe( pipe_slow );
7853 %}
7854 
7855 // LoadShuffle/Rearrange for Integer and Float
7856 
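// With SSE only a byte shuffle is available, so the int shuffle is expanded
// into byte indices for pshufb. With AVX2 and above, vpermd takes doubleword
// indices directly; vpermd has no 128-bit form, which is why the 128-bit case
// below is emitted with the 256-bit encoding (the upper half of the result is
// simply unused).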
7857 instruct loadShuffleI(vec dst, vec src, vec vtmp, rRegP scratch) %{
7858   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
7859             Matcher::vector_length(n) == 4 && UseAVX < 2);
7860   match(Set dst (VectorLoadShuffle src));
7861   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7862   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
7863   ins_encode %{
7864     assert(UseSSE >= 4, "required");
7865 
    // Create a byte shuffle mask from the int shuffle mask, since only a
    // byte shuffle instruction is available on these platforms.
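    // Each int index i expands to the byte indices 4*i .. 4*i+3. For example
    // (illustrative only), the int shuffle {1, 0, 3, 2} becomes the byte
    // shuffle {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}.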
7868 
7869     // Duplicate and multiply each shuffle by 4
7870     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
7871     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
7872     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
7873     __ psllw($vtmp$$XMMRegister, 2);
7874 
7875     // Duplicate again to create 4 copies of byte index
7876     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
7877     __ psllw($dst$$XMMRegister, 8);
7878     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
7879 
7880     // Add 3,2,1,0 to get alternate byte index
7881     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
7882     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
7883   %}
7884   ins_pipe( pipe_slow );
7885 %}
7886 
7887 instruct rearrangeI(vec dst, vec shuffle) %{
  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
            Matcher::vector_length(n) == 4 && UseAVX < 2);
7890   match(Set dst (VectorRearrange dst shuffle));
7891   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
7892   ins_encode %{
7893     assert(UseSSE >= 4, "required");
7894     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
7895   %}
7896   ins_pipe( pipe_slow );
7897 %}
7898 
7899 instruct loadShuffleI_avx(vec dst, vec src) %{
7900   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
7901             UseAVX >= 2);
7902   match(Set dst (VectorLoadShuffle src));
7903   format %{ "vector_load_shuffle $dst, $src" %}
7904   ins_encode %{
    int vlen_enc = vector_length_encoding(this);
7906     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7907   %}
7908   ins_pipe( pipe_slow );
7909 %}
7910 
7911 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
7912   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
7913             UseAVX >= 2);
7914   match(Set dst (VectorRearrange src shuffle));
7915   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7916   ins_encode %{
7917     int vlen_enc = vector_length_encoding(this);
7918     if (vlen_enc == Assembler::AVX_128bit) {
7919       vlen_enc = Assembler::AVX_256bit;
7920     }
7921     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7922   %}
7923   ins_pipe( pipe_slow );
7924 %}
7925 
7926 // LoadShuffle/Rearrange for Long and Double
7927 
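// Without AVX512VL there is no variable 64-bit element permute, so the long
// shuffle is expanded into doubleword indices for vpermd. With AVX-512 (or a
// full 512-bit vector) vpermq takes the quadword indices directly; the
// variable form of vpermq also has no 128-bit encoding, hence the bump to the
// 256-bit encoding below.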
7928 instruct loadShuffleL(vec dst, vec src, vec vtmp, rRegP scratch) %{
7929   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7930             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
7931   match(Set dst (VectorLoadShuffle src));
7932   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7933   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
7934   ins_encode %{
7935     assert(UseAVX >= 2, "required");
7936 
7937     int vlen_enc = vector_length_encoding(this);
    // Create a doubleword shuffle mask from the long shuffle mask, since only
    // a doubleword shuffle instruction is available on these platforms.
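    // Each long index q expands to the doubleword indices 2*q and 2*q+1. For
    // example (illustrative only), the long shuffle {1, 0} becomes the
    // doubleword shuffle {2,3, 0,1}.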
7940 
7941     // Multiply each shuffle by two to get double word index
7942     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
7943     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
7944 
7945     // Duplicate each double word shuffle
7946     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
7947     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
7948 
7949     // Add one to get alternate double word index
7950     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, $scratch$$Register);
7951   %}
7952   ins_pipe( pipe_slow );
7953 %}
7954 
7955 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
7956   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7957             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
7958   match(Set dst (VectorRearrange src shuffle));
7959   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7960   ins_encode %{
7961     assert(UseAVX >= 2, "required");
7962 
7963     int vlen_enc = vector_length_encoding(this);
7964     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7965   %}
7966   ins_pipe( pipe_slow );
7967 %}
7968 
7969 instruct loadShuffleL_evex(vec dst, vec src) %{
7970   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7971             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
7972   match(Set dst (VectorLoadShuffle src));
7973   format %{ "vector_load_shuffle $dst, $src" %}
7974   ins_encode %{
7975     assert(UseAVX > 2, "required");
7976 
7977     int vlen_enc = vector_length_encoding(this);
7978     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7979   %}
7980   ins_pipe( pipe_slow );
7981 %}
7982 
7983 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
7984   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7985             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
7986   match(Set dst (VectorRearrange src shuffle));
7987   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7988   ins_encode %{
7989     assert(UseAVX > 2, "required");
7990 
7991     int vlen_enc = vector_length_encoding(this);
7992     if (vlen_enc == Assembler::AVX_128bit) {
7993       vlen_enc = Assembler::AVX_256bit;
7994     }
7995     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7996   %}
7997   ins_pipe( pipe_slow );
7998 %}
7999 
8000 // --------------------------------- FMA --------------------------------------
8001 // a * b + c
8002 
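// These rules are used only when UseFMA is enabled. The product and sum are
// fused, i.e. a * b + c is computed with a single rounding, so this is not
// equivalent to a separate vector multiply followed by a vector add.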
8003 instruct vfmaF_reg(vec a, vec b, vec c) %{
8004   match(Set c (FmaVF  c (Binary a b)));
8005   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8006   ins_cost(150);
8007   ins_encode %{
8008     assert(UseFMA, "not enabled");
8009     int vlen_enc = vector_length_encoding(this);
8010     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8011   %}
8012   ins_pipe( pipe_slow );
8013 %}
8014 
8015 instruct vfmaF_mem(vec a, memory b, vec c) %{
8016   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8017   match(Set c (FmaVF  c (Binary a (LoadVector b))));
8018   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8019   ins_cost(150);
8020   ins_encode %{
8021     assert(UseFMA, "not enabled");
8022     int vlen_enc = vector_length_encoding(this);
8023     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8024   %}
8025   ins_pipe( pipe_slow );
8026 %}
8027 
8028 instruct vfmaD_reg(vec a, vec b, vec c) %{
8029   match(Set c (FmaVD  c (Binary a b)));
8030   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8031   ins_cost(150);
8032   ins_encode %{
8033     assert(UseFMA, "not enabled");
8034     int vlen_enc = vector_length_encoding(this);
8035     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8036   %}
8037   ins_pipe( pipe_slow );
8038 %}
8039 
8040 instruct vfmaD_mem(vec a, memory b, vec c) %{
8041   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8042   match(Set c (FmaVD  c (Binary a (LoadVector b))));
8043   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8044   ins_cost(150);
8045   ins_encode %{
8046     assert(UseFMA, "not enabled");
8047     int vlen_enc = vector_length_encoding(this);
8048     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8049   %}
8050   ins_pipe( pipe_slow );
8051 %}
8052 
8053 // --------------------------------- Vector Multiply Add --------------------------------------
8054 
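// pmaddwd/vpmaddwd multiply adjacent pairs of signed 16-bit elements and add
// each pair into a single 32-bit result, which is exactly the semantics of the
// MulAddVS2VI (packed short-to-int multiply-add) node.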
8055 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
8056   predicate(UseAVX == 0);
8057   match(Set dst (MulAddVS2VI dst src1));
8058   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
8059   ins_encode %{
8060     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
8061   %}
8062   ins_pipe( pipe_slow );
8063 %}
8064 
8065 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
8066   predicate(UseAVX > 0);
8067   match(Set dst (MulAddVS2VI src1 src2));
8068   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
8069   ins_encode %{
8070     int vlen_enc = vector_length_encoding(this);
8071     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8072   %}
8073   ins_pipe( pipe_slow );
8074 %}
8075 
8076 // --------------------------------- Vector Multiply Add Add ----------------------------------
8077 
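// With AVX512_VNNI the multiply-add and the following vector add can be fused
// into a single evpdpwssd (word dot-product with accumulation). The low
// ins_cost makes the matcher prefer this rule over the separate
// vpmaddwd + vpaddd sequence when both would match.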
8078 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
8079   predicate(VM_Version::supports_avx512_vnni());
8080   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
8081   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
8082   ins_encode %{
8083     assert(UseAVX > 2, "required");
8084     int vlen_enc = vector_length_encoding(this);
8085     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8086   %}
8087   ins_pipe( pipe_slow );
8088   ins_cost(10);
8089 %}
8090 
8091 // --------------------------------- PopCount --------------------------------------
8092 
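// Counts the set bits in every 32-bit lane. There is no predicate here; the
// availability of the AVX-512 vector population-count instructions is checked
// when the matcher decides whether PopCountVI is supported at all.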
8093 instruct vpopcountI(vec dst, vec src) %{
8094   match(Set dst (PopCountVI src));
8095   format %{ "vpopcntd  $dst,$src\t! vector popcount packedI" %}
8096   ins_encode %{
8097     assert(UsePopCountInstruction, "not enabled");
8098 
8099     int vlen_enc = vector_length_encoding(this);
8100     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8101   %}
8102   ins_pipe( pipe_slow );
8103 %}
8104 
8105 // --------------------------------- Bitwise Ternary Logic ----------------------------------
8106 
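// vpternlogd combines three inputs with an arbitrary boolean function: the
// 8-bit immediate is a truth table indexed by the corresponding bits of dst,
// src2 and src3. For example, func == 0x96 computes dst ^ src2 ^ src3 and
// func == 0xE8 the bitwise majority of the three inputs.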
8107 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
8108   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
8109   effect(TEMP dst);
8110   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8111   ins_encode %{
8112     int vector_len = vector_length_encoding(this);
8113     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
8114   %}
8115   ins_pipe( pipe_slow );
8116 %}
8117 
8118 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
8119   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
8120   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
8121   effect(TEMP dst);
8122   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8123   ins_encode %{
8124     int vector_len = vector_length_encoding(this);
8125     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
8126   %}
8127   ins_pipe( pipe_slow );
8128 %}
8129 
8130 // --------------------------------- Rotation Operations ----------------------------------
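// Both the immediate and the variable rotate rules match RotateLeftV as well
// as RotateRightV; the macro assembler routines dispatch on the ideal opcode
// and the element type to select the concrete rotate sequence.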
8131 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
8132   match(Set dst (RotateLeftV src shift));
8133   match(Set dst (RotateRightV src shift));
8134   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
8135   ins_encode %{
8136     int opcode      = this->ideal_Opcode();
8137     int vector_len  = vector_length_encoding(this);
8138     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8139     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
8140   %}
8141   ins_pipe( pipe_slow );
8142 %}
8143 
8144 instruct vprorate(vec dst, vec src, vec shift) %{
8145   match(Set dst (RotateLeftV src shift));
8146   match(Set dst (RotateRightV src shift));
8147   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
8148   ins_encode %{
8149     int opcode      = this->ideal_Opcode();
8150     int vector_len  = vector_length_encoding(this);
8151     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8152     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8153   %}
8154   ins_pipe( pipe_slow );
8155 %}
8156 
8157 #ifdef _LP64
8158 // ---------------------------------- Masked Operations ------------------------------------
8159 
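// VectorCmpMasked compares the lanes selected by $mask and leaves -1 in $dst
// when all of them are equal; otherwise $dst receives the index of the first
// mismatching lane (assuming $mask is a prefix mask, e.g. one produced by
// VectorMaskGen below). kortest sets CF when (~mask | equal) covers all lanes,
// i.e. when no enabled lane differs.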
8160 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
8161   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
8162   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
8163   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
8164   ins_encode %{
8165     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
8166     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
8167 
8168     Label DONE;
8169     int vlen_enc = vector_length_encoding(this, $src1);
8170     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
8171 
8172     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
8173     __ mov64($dst$$Register, -1L);
8174     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
8175     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
8176     __ jccb(Assembler::carrySet, DONE);
8177     __ kmovql($dst$$Register, $ktmp1$$KRegister);
8178     __ notq($dst$$Register);
8179     __ tzcntq($dst$$Register, $dst$$Register);
8180     __ bind(DONE);
8181   %}
8182   ins_pipe( pipe_slow );
8183 %}
8184 
8185 
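// Masked load/store: the k-register mask selects which lanes are transferred;
// on the store side, lanes whose mask bit is clear are not written to memory.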
8186 instruct vmasked_load64(vec dst, memory mem, kReg mask) %{
8187   match(Set dst (LoadVectorMasked mem mask));
8188   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
8189   ins_encode %{
    BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
8191     int vector_len = vector_length_encoding(this);
8192     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
8193   %}
8194   ins_pipe( pipe_slow );
8195 %}
8196 
8197 instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
8198   match(Set dst (VectorMaskGen len));
8199   effect(TEMP temp);
8200   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
8201   ins_encode %{
8202     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
8203   %}
8204   ins_pipe( pipe_slow );
8205 %}
8206 
8207 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
8208   match(Set dst (VectorMaskGen len));
8209   format %{ "vector_mask_gen $len \t! vector mask generator" %}
8210   effect(TEMP temp);
8211   ins_encode %{
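    // Materialize a constant mask with the low $len bits set
    // (all-ones shifted right by 64 - len) and move it into the k register.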
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
8213     __ kmovql($dst$$KRegister, $temp$$Register);
8214   %}
8215   ins_pipe( pipe_slow );
8216 %}
8217 
8218 instruct vmasked_store64(memory mem, vec src, kReg mask) %{
8219   match(Set mem (StoreVectorMasked mem (Binary src mask)));
8220   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
8221   ins_encode %{
8222     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
    BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
8224     int vector_len = vector_length_encoding(src_node);
8225     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, vector_len);
8226   %}
8227   ins_pipe( pipe_slow );
8228 %}
8229 
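// Mask reductions (true count, first/last set lane): the mask operand here is
// held in a vector register rather than a k register. The AVX-512BW flavors
// use a k register temporary; the AVX/SSE flavors below use an extra vector
// temporary instead.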
8230 instruct vmask_truecount_evex(rRegI dst, vec mask, rRegL tmp, kReg ktmp, vec xtmp) %{
8231   predicate(VM_Version::supports_avx512vlbw());
8232   match(Set dst (VectorMaskTrueCount mask));
8233   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp, TEMP xtmp);
8234   format %{ "vector_truecount_evex $mask \t! vector mask true count" %}
8235   ins_encode %{
8236     int opcode = this->ideal_Opcode();
8237     int vlen_enc = vector_length_encoding(this, $mask);
8238     int mask_len = Matcher::vector_length(this, $mask);
8239     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8240                              $tmp$$Register, $ktmp$$KRegister, mask_len, vlen_enc);
8241   %}
8242   ins_pipe( pipe_slow );
8243 %}
8244 
8245 instruct vmask_first_or_last_true_evex(rRegI dst, vec mask, rRegL tmp, kReg ktmp, vec xtmp, rFlagsReg cr) %{
8246   predicate(VM_Version::supports_avx512vlbw());
8247   match(Set dst (VectorMaskFirstTrue mask));
8248   match(Set dst (VectorMaskLastTrue mask));
8249   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp, TEMP xtmp, KILL cr);
8250   format %{ "vector_mask_first_or_last_true_evex $mask \t! vector first/last true location" %}
8251   ins_encode %{
8252     int opcode = this->ideal_Opcode();
8253     int vlen_enc = vector_length_encoding(this, $mask);
8254     int mask_len = Matcher::vector_length(this, $mask);
8255     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8256                              $tmp$$Register, $ktmp$$KRegister, mask_len, vlen_enc);
8257   %}
8258   ins_pipe( pipe_slow );
8259 %}
8260 
8261 instruct vmask_truecount_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1) %{
8262   predicate(!VM_Version::supports_avx512vlbw());
8263   match(Set dst (VectorMaskTrueCount mask));
8264   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, TEMP xtmp1);
8265   format %{ "vector_truecount_avx $mask \t! vector mask true count" %}
8266   ins_encode %{
8267     int opcode = this->ideal_Opcode();
8268     int vlen_enc = vector_length_encoding(this, $mask);
8269     int mask_len = Matcher::vector_length(this, $mask);
8270     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8271                              $xtmp1$$XMMRegister, $tmp$$Register, mask_len, vlen_enc);
8272   %}
8273   ins_pipe( pipe_slow );
8274 %}
8275 
8276 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1, rFlagsReg cr) %{
8277   predicate(!VM_Version::supports_avx512vlbw());
8278   match(Set dst (VectorMaskFirstTrue mask));
8279   match(Set dst (VectorMaskLastTrue mask));
8280   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, TEMP xtmp1, KILL cr);
8281   format %{ "vector_mask_first_or_last_true_avx $mask \t! vector first/last true location" %}
8282   ins_encode %{
8283     int opcode = this->ideal_Opcode();
8284     int vlen_enc = vector_length_encoding(this, $mask);
8285     int mask_len = Matcher::vector_length(this, $mask);
8286     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8287                              $xtmp1$$XMMRegister, $tmp$$Register, mask_len, vlen_enc);
8288   %}
8289   ins_pipe( pipe_slow );
8290 %}
8291 #endif // _LP64
8292 
8293 instruct castVV(vec dst)
8294 %{
8295   match(Set dst (CastVV dst));
8296 
8297   size(0);
8298   format %{ "# castVV of $dst" %}
8299   ins_encode(/* empty encoding */);
8300   ins_cost(0);
8301   ins_pipe(empty);
8302 %}
8303 
8304 instruct castVVLeg(legVec dst)
8305 %{
8306   match(Set dst (CastVV dst));
8307 
8308   size(0);
8309   format %{ "# castVV of $dst" %}
8310   ins_encode(/* empty encoding */);
8311   ins_cost(0);
8312   ins_pipe(empty);
8313 %}