//
// Copyright (c) 2011, 2021, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Common Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// architecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

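// As an illustration only (the register name below is hypothetical and is
// not defined in this file), a declaration such as
//
//   reg_def RTMP(SOC, SOC, Op_RegI, 3, rtmp->as_VMReg());
//
// would describe a register that is caller-saved under both the VM and the
// C calling conventions (SOC, SOC), is spilled and reloaded as a 32-bit
// integer (Op_RegI), and is placed into opcodes with encoding 3.
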
// XMM registers.  512-bit registers, i.e. 16 32-bit words each, labeled (a)-(p).
// Word a in each register holds a Float, words ab hold a Double.
// The whole registers are used in SSE4.2 intrinsics, array copy stubs and
// superword operations (see UseSSE42Intrinsics, UseXMMForArrayCopy and
// UseSuperword flags).
// For pre-EVEX architectures:
//      XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
// For EVEX-enabled architectures:
//      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
//
// Linux ABI:   No XMM registers are preserved across function calls
//              XMM0-XMM7 might hold parameters
// Windows ABI: XMM6-XMM15 preserved across function calls (XMM16-XMM31 are volatile)
//              XMM0-XMM3 might hold parameters

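// Worked example of the word layout described above: a Float allocated to
// XMM0 lives in slot XMM0 (word a), a Double occupies the adjacent pair
// XMM0/XMM0b (words a-b), and a full 512-bit vector value uses all sixteen
// slots XMM0 through XMM0p (512 bits / 32 bits per word = 16 words).
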
reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));

reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));

reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));

reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));

reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));

reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));

reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));

reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));

#ifdef _LP64

reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));

reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));

reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));

reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));

reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));

reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));

reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));

reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));

reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));

reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));

reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));

reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));

reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));

reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));

reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));

reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));

reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));

reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));

reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));

reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));

reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));

reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));

reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));

reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));

#endif // _LP64

#ifdef _LP64
reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
#else
reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
#endif // _LP64

// AVX3 Mask Registers.
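// Each 64-bit opmask register kN is described to the allocator as two
// adjacent 32-bit VMReg slots, KN and KN_H (the upper half), in the same
// way the XMM registers above are described one 32-bit word at a time.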
reg_def K1   (SOC, SOC, Op_RegI,  1, k1->as_VMReg());
reg_def K1_H (SOC, SOC, Op_RegI,  1, k1->as_VMReg()->next());

reg_def K2   (SOC, SOC, Op_RegI,  2, k2->as_VMReg());
reg_def K2_H (SOC, SOC, Op_RegI,  2, k2->as_VMReg()->next());

reg_def K3   (SOC, SOC, Op_RegI,  3, k3->as_VMReg());
reg_def K3_H (SOC, SOC, Op_RegI,  3, k3->as_VMReg()->next());

reg_def K4   (SOC, SOC, Op_RegI,  4, k4->as_VMReg());
reg_def K4_H (SOC, SOC, Op_RegI,  4, k4->as_VMReg()->next());

reg_def K5   (SOC, SOC, Op_RegI,  5, k5->as_VMReg());
reg_def K5_H (SOC, SOC, Op_RegI,  5, k5->as_VMReg()->next());

reg_def K6   (SOC, SOC, Op_RegI,  6, k6->as_VMReg());
reg_def K6_H (SOC, SOC, Op_RegI,  6, k6->as_VMReg()->next());

reg_def K7   (SOC, SOC, Op_RegI,  7, k7->as_VMReg());
reg_def K7_H (SOC, SOC, Op_RegI,  7, k7->as_VMReg()->next());


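// Allocation classes group the register words defined above into chunks for
// the register allocator.  The ordering matters; note the comment further
// down that the flags allocation class should be last.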
alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
                   XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
                   XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
                   XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
                   XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
                   XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
                   XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
                   XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
#ifdef _LP64
                  ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
                   XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
                   XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
                   XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
                   XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
                   XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
                   XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
                   XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
                  ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
                   XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
                   XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
                   XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
                   XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
                   XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
                   XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
                   XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
                   XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
                   XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
                   XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
                   XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
                   XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
                   XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
                   XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
                   XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
#endif
                      );

alloc_class chunk2(K7, K7_H,
                   K6, K6_H,
                   K5, K5_H,
                   K4, K4_H,
                   K3, K3_H,
                   K2, K2_H,
                   K1, K1_H);

reg_class  vectmask_reg(K1, K1_H,
                        K2, K2_H,
                        K3, K3_H,
                        K4, K4_H,
                        K5, K5_H,
                        K6, K6_H,
                        K7, K7_H);

reg_class vectmask_reg_K1(K1, K1_H);
reg_class vectmask_reg_K2(K2, K2_H);
reg_class vectmask_reg_K3(K3, K3_H);
reg_class vectmask_reg_K4(K4, K4_H);
reg_class vectmask_reg_K5(K5, K5_H);
reg_class vectmask_reg_K6(K6, K6_H);
reg_class vectmask_reg_K7(K7, K7_H);

// flags allocation class should be last.
alloc_class chunk3(RFLAGS);


// Singleton class for condition codes
reg_class int_flags(RFLAGS);

// Class for pre-EVEX float registers
reg_class float_reg_legacy(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15
#endif
                    );

// Class for EVEX float registers
reg_class float_reg_evex(XMM0,
                    XMM1,
                    XMM2,
                    XMM3,
                    XMM4,
                    XMM5,
                    XMM6,
                    XMM7
#ifdef _LP64
                   ,XMM8,
                    XMM9,
                    XMM10,
                    XMM11,
                    XMM12,
                    XMM13,
                    XMM14,
                    XMM15,
                    XMM16,
                    XMM17,
                    XMM18,
                    XMM19,
                    XMM20,
                    XMM21,
                    XMM22,
                    XMM23,
                    XMM24,
                    XMM25,
                    XMM26,
                    XMM27,
                    XMM28,
                    XMM29,
                    XMM30,
                    XMM31
#endif
                    );

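// reg_class_dynamic chooses between the two classes above using the
// %{ ... %} runtime predicate: the first (EVEX) class is used when the
// predicate holds on the current CPU, otherwise the second (legacy) class
// is used.  The same pattern is repeated for the double and vector
// register classes below.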
reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX double registers
reg_class double_reg_legacy(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b
#endif
                     );

// Class for EVEX double registers
reg_class double_reg_evex(XMM0,  XMM0b,
                     XMM1,  XMM1b,
                     XMM2,  XMM2b,
                     XMM3,  XMM3b,
                     XMM4,  XMM4b,
                     XMM5,  XMM5b,
                     XMM6,  XMM6b,
                     XMM7,  XMM7b
#ifdef _LP64
                    ,XMM8,  XMM8b,
                     XMM9,  XMM9b,
                     XMM10, XMM10b,
                     XMM11, XMM11b,
                     XMM12, XMM12b,
                     XMM13, XMM13b,
                     XMM14, XMM14b,
                     XMM15, XMM15b,
                     XMM16, XMM16b,
                     XMM17, XMM17b,
                     XMM18, XMM18b,
                     XMM19, XMM19b,
                     XMM20, XMM20b,
                     XMM21, XMM21b,
                     XMM22, XMM22b,
                     XMM23, XMM23b,
                     XMM24, XMM24b,
                     XMM25, XMM25b,
                     XMM26, XMM26b,
                     XMM27, XMM27b,
                     XMM28, XMM28b,
                     XMM29, XMM29b,
                     XMM30, XMM30b,
                     XMM31, XMM31b
#endif
                     );

reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );

// Class for pre-EVEX 32bit vector registers
reg_class vectors_reg_legacy(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15
#endif
                      );

// Class for EVEX 32bit vector registers
reg_class vectors_reg_evex(XMM0,
                      XMM1,
                      XMM2,
                      XMM3,
                      XMM4,
                      XMM5,
                      XMM6,
                      XMM7
#ifdef _LP64
                     ,XMM8,
                      XMM9,
                      XMM10,
                      XMM11,
                      XMM12,
                      XMM13,
                      XMM14,
                      XMM15,
                      XMM16,
                      XMM17,
                      XMM18,
                      XMM19,
                      XMM20,
                      XMM21,
                      XMM22,
                      XMM23,
                      XMM24,
                      XMM25,
                      XMM26,
                      XMM27,
                      XMM28,
                      XMM29,
                      XMM30,
                      XMM31
#endif
                      );

reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

// Class for pre-EVEX 64bit vector registers
reg_class vectord_reg_legacy(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b
#endif
                      );

// Class for EVEX 64bit vector registers
reg_class vectord_reg_evex(XMM0,  XMM0b,
                      XMM1,  XMM1b,
                      XMM2,  XMM2b,
                      XMM3,  XMM3b,
                      XMM4,  XMM4b,
                      XMM5,  XMM5b,
                      XMM6,  XMM6b,
                      XMM7,  XMM7b
#ifdef _LP64
                     ,XMM8,  XMM8b,
                      XMM9,  XMM9b,
                      XMM10, XMM10b,
                      XMM11, XMM11b,
                      XMM12, XMM12b,
                      XMM13, XMM13b,
                      XMM14, XMM14b,
                      XMM15, XMM15b,
                      XMM16, XMM16b,
                      XMM17, XMM17b,
                      XMM18, XMM18b,
                      XMM19, XMM19b,
                      XMM20, XMM20b,
                      XMM21, XMM21b,
                      XMM22, XMM22b,
                      XMM23, XMM23b,
                      XMM24, XMM24b,
                      XMM25, XMM25b,
                      XMM26, XMM26b,
                      XMM27, XMM27b,
                      XMM28, XMM28b,
                      XMM29, XMM29b,
                      XMM30, XMM30b,
                      XMM31, XMM31b
#endif
                      );

reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );

 965 // Class for all 128bit vector registers
 966 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 967                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 968                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 969                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 970                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 971                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 972                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 973                       XMM7,  XMM7b,  XMM7c,  XMM7d
 974 #ifdef _LP64
 975                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 976                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 977                       XMM10, XMM10b, XMM10c, XMM10d,
 978                       XMM11, XMM11b, XMM11c, XMM11d,
 979                       XMM12, XMM12b, XMM12c, XMM12d,
 980                       XMM13, XMM13b, XMM13c, XMM13d,
 981                       XMM14, XMM14b, XMM14c, XMM14d,
 982                       XMM15, XMM15b, XMM15c, XMM15d
 983 #endif
 984                       );
 985 
 986 // Class for all 128bit vector registers
 987 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 988                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 989                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 990                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 991                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 992                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 993                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 994                       XMM7,  XMM7b,  XMM7c,  XMM7d
 995 #ifdef _LP64
 996                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 997                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 998                       XMM10, XMM10b, XMM10c, XMM10d,
 999                       XMM11, XMM11b, XMM11c, XMM11d,
1000                       XMM12, XMM12b, XMM12c, XMM12d,
1001                       XMM13, XMM13b, XMM13c, XMM13d,
1002                       XMM14, XMM14b, XMM14c, XMM14d,
1003                       XMM15, XMM15b, XMM15c, XMM15d,
1004                       XMM16, XMM16b, XMM16c, XMM16d,
1005                       XMM17, XMM17b, XMM17c, XMM17d,
1006                       XMM18, XMM18b, XMM18c, XMM18d,
1007                       XMM19, XMM19b, XMM19c, XMM19d,
1008                       XMM20, XMM20b, XMM20c, XMM20d,
1009                       XMM21, XMM21b, XMM21c, XMM21d,
1010                       XMM22, XMM22b, XMM22c, XMM22d,
1011                       XMM23, XMM23b, XMM23c, XMM23d,
1012                       XMM24, XMM24b, XMM24c, XMM24d,
1013                       XMM25, XMM25b, XMM25c, XMM25d,
1014                       XMM26, XMM26b, XMM26c, XMM26d,
1015                       XMM27, XMM27b, XMM27c, XMM27d,
1016                       XMM28, XMM28b, XMM28c, XMM28d,
1017                       XMM29, XMM29b, XMM29c, XMM29d,
1018                       XMM30, XMM30b, XMM30c, XMM30d,
1019                       XMM31, XMM31b, XMM31c, XMM31d
1020 #endif
1021                       );
1022 
1023 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
1024 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1025 
1026 // Class for all 256bit vector registers
1027 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1028                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1029                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1030                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1031                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1032                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1033                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1034                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1035 #ifdef _LP64
1036                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1037                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1038                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1039                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1040                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1041                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1042                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1043                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
1044 #endif
1045                       );
1046 
1047 // Class for all 256bit vector registers
1048 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1049                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1050                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1051                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1052                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1053                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1054                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1055                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1056 #ifdef _LP64
1057                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1058                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1059                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1060                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1061                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1062                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1063                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1064                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1065                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1066                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1067                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1068                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1069                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1070                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1071                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1072                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1073                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1074                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1075                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1076                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1077                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1078                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1079                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1080                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1081 #endif
1082                       );
1083 
1084 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1085 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1086 
1087 // Class for all 512bit vector registers
1088 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1089                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1090                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1091                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1092                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1093                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1094                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1095                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1096 #ifdef _LP64
1097                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1098                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1099                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1100                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1101                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1102                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1103                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1104                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1105                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1106                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1107                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1108                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1109                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1110                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1111                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1112                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1113                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1114                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1115                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1116                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1117                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1118                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1119                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1120                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1121 #endif
1122                       );
1123 
1124 // Class for restricted 512bit vector registers
1125 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1126                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1127                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1128                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1129                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1130                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1131                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1132                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1133 #ifdef _LP64
1134                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1135                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1136                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1137                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1138                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1139                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1140                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1141                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1142 #endif
1143                       );
1144 
1145 reg_class_dynamic vectorz_reg   (vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1146 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1147 
1148 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1149 %}
1150 
1151 
1152 //----------SOURCE BLOCK-------------------------------------------------------
1153 // This is a block of C++ code which provides values, functions, and
1154 // definitions necessary in the rest of the architecture description
1155 
1156 source_hpp %{
1157 // Header information of the source block.
1158 // Method declarations/definitions which are used outside
1159 // the ad-scope can conveniently be defined here.
1160 //
1161 // To keep related declarations/definitions/uses close together,
1162 // we switch between source %{ %} and source_hpp %{ %} freely as needed.
1163 
1164 #include "runtime/vm_version.hpp"
1165 
1166 class NativeJump;
1167 
1168 class CallStubImpl {
1169 
1170   //--------------------------------------------------------------
1171   //---<  Used for optimization in Compile::shorten_branches  >---
1172   //--------------------------------------------------------------
1173 
1174  public:
1175   // Size of call trampoline stub.
1176   static uint size_call_trampoline() {
1177     return 0; // no call trampolines on this platform
1178   }
1179 
1180   // number of relocations needed by a call trampoline stub
1181   static uint reloc_call_trampoline() {
1182     return 0; // no call trampolines on this platform
1183   }
1184 };
1185 
1186 class HandlerImpl {
1187 
1188  public:
1189 
1190   static int emit_exception_handler(CodeBuffer &cbuf);
1191   static int emit_deopt_handler(CodeBuffer& cbuf);
1192 
1193   static uint size_exception_handler() {
1194     // NativeCall instruction size is the same as NativeJump.
1195     // The exception handler starts out as a jump and can be patched to
1196     // a call by deoptimization.  (4932387)
1197     // Note that this value is also credited (in output.cpp) to
1198     // the size of the code section.
1199     return NativeJump::instruction_size;
1200   }
1201 
1202 #ifdef _LP64
1203   static uint size_deopt_handler() {
1204     // three 5-byte instructions plus one move for the unreachable address.
1205     return 15+3;
1206   }
1207 #else
1208   static uint size_deopt_handler() {
1209     // NativeCall instruction size is the same as NativeJump.
1210     // The exception handler starts out as a jump and can be patched to
1211     // a call by deoptimization.  (4932387)
1212     // Note that this value is also credited (in output.cpp) to
1213     // the size of the code section.
1214     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1215   }
1216 #endif
1217 };
1218 
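     // Map a vector length in bytes to the AVX encoding expected by the assembler;
     // 4- and 8-byte vectors are emitted with the 128-bit encoding.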
1219 inline Assembler::AvxVectorLen vector_length_encoding(int bytes) {
1220   switch(bytes) {
1221     case  4: // fall-through
1222     case  8: // fall-through
1223     case 16: return Assembler::AVX_128bit;
1224     case 32: return Assembler::AVX_256bit;
1225     case 64: return Assembler::AVX_512bit;
1226 
1227     default: {
1228       ShouldNotReachHere();
1229       return Assembler::AVX_NoVec;
1230     }
1231   }
1232 }
1233 
1234 static inline Assembler::AvxVectorLen vector_length_encoding(const Node* n) {
1235   return vector_length_encoding(Matcher::vector_length_in_bytes(n));
1236 }
1237 
1238 static inline Assembler::AvxVectorLen vector_length_encoding(const MachNode* use, MachOper* opnd) {
1239   uint def_idx = use->operand_index(opnd);
1240   Node* def = use->in(def_idx);
1241   return vector_length_encoding(def);
1242 }
1243 
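     // True when the BoolTest value has the unsigned_compare bit set, i.e. the
     // predicate denotes an unsigned comparison.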
1244 static inline bool is_unsigned_booltest_pred(int bt) {
1245   return ((bt & BoolTest::unsigned_compare) == BoolTest::unsigned_compare);
1246 }
1247 
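     // Platform-dependent node flags: Flag_intel_jcc_erratum marks nodes that may
     // need extra padding to avoid the Intel JCC erratum (see
     // pd_alignment_required() and compute_padding() below).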
1248 class Node::PD {
1249 public:
1250   enum NodeFlags {
1251     Flag_intel_jcc_erratum = Node::_last_flag << 1,
1252     _last_flag             = Flag_intel_jcc_erratum
1253   };
1254 };
1255 
1256 %} // end source_hpp
1257 
1258 source %{
1259 
1260 #include "opto/addnode.hpp"
1261 #include "c2_intelJccErratum_x86.hpp"
1262 
1263 void PhaseOutput::pd_perform_mach_node_analysis() {
1264   if (VM_Version::has_intel_jcc_erratum()) {
1265     int extra_padding = IntelJccErratum::tag_affected_machnodes(C, C->cfg(), C->regalloc());
1266     _buf_sizes._code += extra_padding;
1267   }
1268 }
1269 
1270 int MachNode::pd_alignment_required() const {
1271   if (VM_Version::has_intel_jcc_erratum() && IntelJccErratum::is_jcc_erratum_branch(this)) {
1272     // Conservatively add worst case padding. We assume that relocInfo::addr_unit() is 1 on x86.
1273     return IntelJccErratum::largest_jcc_size() + 1;
1274   } else {
1275     return 1;
1276   }
1277 }
1278 
1279 int MachNode::compute_padding(int current_offset) const {
1280   if (flags() & Node::PD::Flag_intel_jcc_erratum) {
1281     Compile* C = Compile::current();
1282     PhaseOutput* output = C->output();
1283     Block* block = output->block();
1284     int index = output->index();
1285     return IntelJccErratum::compute_padding(current_offset, this, block, index, C->regalloc());
1286   } else {
1287     return 0;
1288   }
1289 }
1290 
1291 // Emit exception handler code.
1292 // Stuff framesize into a register and call a VM stub routine.
1293 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1294 
1295   // Note that the code buffer's insts_mark is always relative to insts.
1296   // That's why we must use the macroassembler to generate a handler.
1297   C2_MacroAssembler _masm(&cbuf);
1298   address base = __ start_a_stub(size_exception_handler());
1299   if (base == NULL) {
1300     ciEnv::current()->record_failure("CodeCache is full");
1301     return 0;  // CodeBuffer::expand failed
1302   }
1303   int offset = __ offset();
1304   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1305   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1306   __ end_a_stub();
1307   return offset;
1308 }
1309 
1310 // Emit deopt handler code.
1311 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1312 
1313   // Note that the code buffer's insts_mark is always relative to insts.
1314   // That's why we must use the macroassembler to generate a handler.
1315   C2_MacroAssembler _masm(&cbuf);
1316   address base = __ start_a_stub(size_deopt_handler());
1317   if (base == NULL) {
1318     ciEnv::current()->record_failure("CodeCache is full");
1319     return 0;  // CodeBuffer::expand failed
1320   }
1321   int offset = __ offset();
1322 
1323 #ifdef _LP64
1324   address the_pc = (address) __ pc();
1325   Label next;
1326   // push "the_pc" on the stack without destroying any registers,
1327   // as they all may be live.
1328 
1329   // push address of "next"
1330   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1331   __ bind(next);
1332   // adjust it so it matches "the_pc"
1333   __ subptr(Address(rsp, 0), __ offset() - offset);
1334 #else
1335   InternalAddress here(__ pc());
1336   __ pushptr(here.addr());
1337 #endif
1338 
1339   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1340   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1341   __ end_a_stub();
1342   return offset;
1343 }
1344 
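     // Map a Java BasicType to the assembler operand width (B, W, D or Q).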
1345 Assembler::Width widthForType(BasicType bt) {
1346   if (bt == T_BYTE) {
1347     return Assembler::B;
1348   } else if (bt == T_SHORT) {
1349     return Assembler::W;
1350   } else if (bt == T_INT) {
1351     return Assembler::D;
1352   } else {
1353     assert(bt == T_LONG, "not a long: %s", type2name(bt));
1354     return Assembler::Q;
1355   }
1356 }
1357 
1358 //=============================================================================
1359 
1360   // Float masks come from different places depending on platform.
1361 #ifdef _LP64
1362   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1363   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1364   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1365   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1366 #else
1367   static address float_signmask()  { return (address)float_signmask_pool; }
1368   static address float_signflip()  { return (address)float_signflip_pool; }
1369   static address double_signmask() { return (address)double_signmask_pool; }
1370   static address double_signflip() { return (address)double_signflip_pool; }
1371 #endif
1372   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1373   static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
1374   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1375   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1376   static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
1377   static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
1378   static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
1379   static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
1380   static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
1381   static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
1382   static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
1383   static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
1384 
1385 //=============================================================================
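     // match_rule_supported() filters out opcodes whose match rules exist in the AD
     // file but cannot be used on the current CPU, typically because a required
     // SSE/AVX/AVX-512 feature or enabling flag is missing.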
1386 const bool Matcher::match_rule_supported(int opcode) {
1387   if (!has_match_rule(opcode)) {
1388     return false; // no match rule present
1389   }
1390   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1391   switch (opcode) {
1392     case Op_AbsVL:
1393     case Op_StoreVectorScatter:
1394       if (UseAVX < 3) {
1395         return false;
1396       }
1397       break;
1398     case Op_PopCountI:
1399     case Op_PopCountL:
1400       if (!UsePopCountInstruction) {
1401         return false;
1402       }
1403       break;
1404     case Op_PopCountVI:
1405       if (!UsePopCountInstruction || !VM_Version::supports_avx512_vpopcntdq()) {
1406         return false;
1407       }
1408       break;
1409     case Op_MulVI:
1410       if ((UseSSE < 4) && (UseAVX < 1)) { // only with SSE4_1 or AVX
1411         return false;
1412       }
1413       break;
1414     case Op_MulVL:
1415       if (UseSSE < 4) { // only with SSE4_1 or AVX
1416         return false;
1417       }
1418       break;
1419     case Op_MulReductionVL:
1420       if (VM_Version::supports_avx512dq() == false) {
1421         return false;
1422       }
1423       break;
1424     case Op_AddReductionVL:
1425       if (UseSSE < 2) { // requires at least SSE2
1426         return false;
1427       }
1428       break;
1429     case Op_AbsVB:
1430     case Op_AbsVS:
1431     case Op_AbsVI:
1432     case Op_AddReductionVI:
1433     case Op_AndReductionV:
1434     case Op_OrReductionV:
1435     case Op_XorReductionV:
1436       if (UseSSE < 3) { // requires at least SSSE3
1437         return false;
1438       }
1439       break;
1440     case Op_VectorLoadShuffle:
1441     case Op_VectorRearrange:
1442     case Op_MulReductionVI:
1443       if (UseSSE < 4) { // requires at least SSE4
1444         return false;
1445       }
1446       break;
1447     case Op_SqrtVD:
1448     case Op_SqrtVF:
1449     case Op_VectorMaskCmp:
1450     case Op_VectorCastB2X:
1451     case Op_VectorCastS2X:
1452     case Op_VectorCastI2X:
1453     case Op_VectorCastL2X:
1454     case Op_VectorCastF2X:
1455     case Op_VectorCastD2X:
1456       if (UseAVX < 1) { // enabled for AVX only
1457         return false;
1458       }
1459       break;
1460     case Op_CompareAndSwapL:
1461 #ifdef _LP64
1462     case Op_CompareAndSwapP:
1463 #endif
1464       if (!VM_Version::supports_cx8()) {
1465         return false;
1466       }
1467       break;
1468     case Op_CMoveVF:
1469     case Op_CMoveVD:
1470       if (UseAVX < 1) { // enabled for AVX only
1471         return false;
1472       }
1473       break;
1474     case Op_StrIndexOf:
1475       if (!UseSSE42Intrinsics) {
1476         return false;
1477       }
1478       break;
1479     case Op_StrIndexOfChar:
1480       if (!UseSSE42Intrinsics) {
1481         return false;
1482       }
1483       break;
1484     case Op_OnSpinWait:
1485       if (VM_Version::supports_on_spin_wait() == false) {
1486         return false;
1487       }
1488       break;
1489     case Op_MulVB:
1490     case Op_LShiftVB:
1491     case Op_RShiftVB:
1492     case Op_URShiftVB:
1493     case Op_VectorInsert:
1494     case Op_VectorLoadMask:
1495     case Op_VectorStoreMask:
1496     case Op_VectorBlend:
1497       if (UseSSE < 4) {
1498         return false;
1499       }
1500       break;
1501 #ifdef _LP64
1502     case Op_MaxD:
1503     case Op_MaxF:
1504     case Op_MinD:
1505     case Op_MinF:
1506       if (UseAVX < 1) { // enabled for AVX only
1507         return false;
1508       }
1509       break;
1510 #endif
1511     case Op_CacheWB:
1512     case Op_CacheWBPreSync:
1513     case Op_CacheWBPostSync:
1514       if (!VM_Version::supports_data_cache_line_flush()) {
1515         return false;
1516       }
1517       break;
1518     case Op_ExtractB:
1519     case Op_ExtractL:
1520     case Op_ExtractI:
1521     case Op_RoundDoubleMode:
1522       if (UseSSE < 4) {
1523         return false;
1524       }
1525       break;
1526     case Op_RoundDoubleModeV:
1527       if (VM_Version::supports_avx() == false) {
1528         return false; // 128bit vroundpd is not available
1529       }
1530       break;
1531     case Op_LoadVectorGather:
1532       if (UseAVX < 2) {
1533         return false;
1534       }
1535       break;
1536     case Op_FmaVD:
1537     case Op_FmaVF:
1538       if (!UseFMA) {
1539         return false;
1540       }
1541       break;
1542     case Op_MacroLogicV:
1543       if (UseAVX < 3 || !UseVectorMacroLogic) {
1544         return false;
1545       }
1546       break;
1547 
1548     case Op_VectorCmpMasked:
1549     case Op_VectorMaskGen:
1550     case Op_LoadVectorMasked:
1551     case Op_StoreVectorMasked:
1552       if (!is_LP64 || UseAVX < 3 || !VM_Version::supports_bmi2()) {
1553         return false;
1554       }
1555       break;
1556     case Op_VectorMaskFirstTrue:
1557     case Op_VectorMaskLastTrue:
1558     case Op_VectorMaskTrueCount:
1559       if (!is_LP64 || UseAVX < 1) {
1560          return false;
1561       }
1562       break;
1563     case Op_CopySignD:
1564     case Op_CopySignF:
1565       if (UseAVX < 3 || !is_LP64) {
1566         return false;
1567       }
1568       if (!VM_Version::supports_avx512vl()) {
1569         return false;
1570       }
1571       break;
1572 #ifndef _LP64
1573     case Op_AddReductionVF:
1574     case Op_AddReductionVD:
1575     case Op_MulReductionVF:
1576     case Op_MulReductionVD:
1577       if (UseSSE < 1) { // requires at least SSE
1578         return false;
1579       }
1580       break;
1581     case Op_MulAddVS2VI:
1582     case Op_RShiftVL:
1583     case Op_AbsVD:
1584     case Op_NegVD:
1585       if (UseSSE < 2) {
1586         return false;
1587       }
1588       break;
1589 #endif // !LP64
1590     case Op_SignumF:
1591       if (UseSSE < 1) {
1592         return false;
1593       }
1594       break;
1595     case Op_SignumD:
1596       if (UseSSE < 2) {
1597         return false;
1598       }
1599       break;
1600   }
1601   return true;  // Match rules are supported by default.
1602 }
1603 
1604 //------------------------------------------------------------------------
1605 
1606 // Identify extra cases that we might want to provide match rules for vector nodes and
1607 // other intrinsics guarded with vector length (vlen) and element type (bt).
1608 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
1609   const bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
1610   if (!match_rule_supported(opcode)) {
1611     return false;
1612   }
1613   // Matcher::vector_size_supported() restricts vector sizes in the following way (see Matcher::vector_width_in_bytes):
1614   //   * SSE2 supports 128bit vectors for all types;
1615   //   * AVX1 supports 256bit vectors only for FLOAT and DOUBLE types;
1616   //   * AVX2 supports 256bit vectors for all types;
1617   //   * AVX512F supports 512bit vectors only for INT, FLOAT, and DOUBLE types;
1618   //   * AVX512BW supports 512bit vectors for BYTE, SHORT, and CHAR types.
1619   // There's also a limit on minimum vector size supported: 2 elements (or 4 bytes for BYTE).
1620   // And MaxVectorSize is taken into account as well.
1621   if (!vector_size_supported(bt, vlen)) {
1622     return false;
1623   }
1624   // Special cases which require vector length follow:
1625   //   * implementation limitations
1626   //   * some 512bit vector operations on FLOAT and DOUBLE types require AVX512DQ
1627   //   * 128bit vroundpd instruction is present only in AVX1
1628   int size_in_bits = vlen * type2aelembytes(bt) * BitsPerByte;
1629   switch (opcode) {
1630     case Op_AbsVF:
1631     case Op_NegVF:
1632       if ((vlen == 16) && (VM_Version::supports_avx512dq() == false)) {
1633         return false; // 512bit vandps and vxorps are not available
1634       }
1635       break;
1636     case Op_AbsVD:
1637     case Op_NegVD:
1638     case Op_MulVL:
1639       if ((vlen == 8) && (VM_Version::supports_avx512dq() == false)) {
1640         return false; // 512bit vpmullq, vandpd and vxorpd are not available
1641       }
1642       break;
1643     case Op_CMoveVF:
1644       if (vlen != 8) {
1645         return false; // implementation limitation (only vcmov8F_reg is present)
1646       }
1647       break;
1648     case Op_RotateRightV:
1649     case Op_RotateLeftV:
1650       if (bt != T_INT && bt != T_LONG) {
1651         return false;
1652       } // fallthrough
1653     case Op_MacroLogicV:
1654       if (!VM_Version::supports_evex() ||
1655           ((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
1656         return false;
1657       }
1658       break;
1659     case Op_ClearArray:
1660     case Op_VectorMaskGen:
1661     case Op_VectorCmpMasked:
1662     case Op_LoadVectorMasked:
1663     case Op_StoreVectorMasked:
1664       if (!is_LP64 || !VM_Version::supports_avx512bw()) {
1665         return false;
1666       }
1667       if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
1668         return false;
1669       }
1670       break;
1671     case Op_CMoveVD:
1672       if (vlen != 4) {
1673         return false; // implementation limitation (only vcmov4D_reg is present)
1674       }
1675       break;
1676     case Op_MaxV:
1677     case Op_MinV:
1678       if (UseSSE < 4 && is_integral_type(bt)) {
1679         return false;
1680       }
1681       if (bt == T_FLOAT || bt == T_DOUBLE) {
1682         // Float/Double intrinsics are currently enabled only for the AVX family.
1683         if (UseAVX == 0) {
1684           return false;
1685         }
1686         if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) { // 512 bit Float/Double intrinsics need AVX512DQ
1687           return false;
1688         }
1689       }
1690       break;
1691     case Op_CallLeafVector:
1692       if (size_in_bits == 512 && !VM_Version::supports_avx512vlbwdq()) {
1693         return false;
1694       }
1695       break;
1696     case Op_AddReductionVI:
1697       if (bt == T_INT && (UseSSE < 3 || !VM_Version::supports_ssse3())) {
1698         return false;
1699       }
1700       // fallthrough
1701     case Op_AndReductionV:
1702     case Op_OrReductionV:
1703     case Op_XorReductionV:
1704       if (is_subword_type(bt) && (UseSSE < 4)) {
1705         return false;
1706       }
1707 #ifndef _LP64
1708       if (bt == T_BYTE || bt == T_LONG) {
1709         return false;
1710       }
1711 #endif
1712       break;
1713 #ifndef _LP64
1714     case Op_VectorInsert:
1715       if (bt == T_LONG || bt == T_DOUBLE) {
1716         return false;
1717       }
1718       break;
1719 #endif
1720     case Op_MinReductionV:
1721     case Op_MaxReductionV:
1722       if ((bt == T_INT || is_subword_type(bt)) && UseSSE < 4) {
1723         return false;
1724       } else if (bt == T_LONG && (UseAVX < 3 || !VM_Version::supports_avx512vlbwdq())) {
1725         return false;
1726       }
1727       // Float/Double intrinsics enabled for AVX family.
1728       if (UseAVX == 0 && (bt == T_FLOAT || bt == T_DOUBLE)) {
1729         return false;
1730       }
1731       if (UseAVX > 2 && (!VM_Version::supports_avx512dq() && size_in_bits == 512)) {
1732         return false;
1733       }
1734 #ifndef _LP64
1735       if (bt == T_BYTE || bt == T_LONG) {
1736         return false;
1737       }
1738 #endif
1739       break;
1740     case Op_VectorTest:
1741       if (UseSSE < 4) {
1742         return false; // Implementation limitation
1743       } else if (size_in_bits < 32) {
1744         return false; // Implementation limitation
1745       } else if (size_in_bits == 512 && (VM_Version::supports_avx512bw() == false)) {
1746         return false; // Implementation limitation
1747       }
1748       break;
1749     case Op_VectorLoadShuffle:
1750     case Op_VectorRearrange:
1751       if (vlen == 2) {
1752         return false; // Implementation limitation due to how shuffle is loaded
1753       } else if (size_in_bits == 256 && UseAVX < 2) {
1754         return false; // Implementation limitation
1755       } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi())  {
1756         return false; // Implementation limitation
1757       } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw())  {
1758         return false; // Implementation limitation
1759       }
1760       break;
1761     case Op_VectorLoadMask:
1762       if (size_in_bits == 256 && UseAVX < 2) {
1763         return false; // Implementation limitation
1764       }
1765       // fallthrough
1766     case Op_VectorStoreMask:
1767       if (vlen == 2) {
1768         return false; // Implementation limitation
1769       }
1770       break;
1771     case Op_VectorCastB2X:
1772       if (size_in_bits == 256 && UseAVX < 2) {
1773         return false; // Implementation limitation
1774       }
1775       break;
1776     case Op_VectorCastS2X:
1777       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1778         return false;
1779       }
1780       break;
1781     case Op_VectorCastI2X:
1782       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1783         return false;
1784       }
1785       break;
1786     case Op_VectorCastL2X:
1787       if (is_integral_type(bt) && size_in_bits == 256 && UseAVX < 2) {
1788         return false;
1789       } else if (!is_integral_type(bt) && !VM_Version::supports_avx512dq()) {
1790         return false;
1791       }
1792       break;
1793     case Op_VectorCastF2X:
1794     case Op_VectorCastD2X:
1795       if (is_integral_type(bt)) {
1796         // Casts from FP to integral types require special fixup logic not easily
1797         // implementable with vectors.
1798         return false; // Implementation limitation
1799       } // fallthrough
1800     case Op_MulReductionVI:
1801       if (bt == T_BYTE && size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
1802         return false;
1803       }
1804       break;
1805     case Op_StoreVectorScatter:
1806       if (bt == T_BYTE || bt == T_SHORT) {
1807         return false;
1808       } else if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
1809         return false;
1810       }
1811       // fallthrough
1812     case Op_LoadVectorGather:
1813       if (size_in_bits == 64) {
1814         return false;
1815       }
1816       break;
1817     case Op_VectorMaskCmp:
1818       if (vlen < 2 || size_in_bits < 32) {
1819         return false;
1820       }
1821       break;
1822   }
1823   return true;  // Match rules are supported by default.
1824 }
1825 
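     // Replace a generic vector operand (VEC/LEGVEC) with a concrete operand class
     // for the given ideal register; on CPUs without AVX512VL/BW/DQ (e.g. KNL),
     // 512-bit TEMP operands are conservatively restricted to the legacy range.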
1826 MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
1827   assert(Matcher::is_generic_vector(generic_opnd), "not generic");
1828   bool legacy = (generic_opnd->opcode() == LEGVEC);
1829   if (!VM_Version::supports_avx512vlbwdq() && // KNL
1830       is_temp && !legacy && (ideal_reg == Op_VecZ)) {
1831     // Conservatively specialize 512bit vec TEMP operands to legVecZ (zmm0-15) on KNL.
1832     return new legVecZOper();
1833   }
1834   if (legacy) {
1835     switch (ideal_reg) {
1836       case Op_VecS: return new legVecSOper();
1837       case Op_VecD: return new legVecDOper();
1838       case Op_VecX: return new legVecXOper();
1839       case Op_VecY: return new legVecYOper();
1840       case Op_VecZ: return new legVecZOper();
1841     }
1842   } else {
1843     switch (ideal_reg) {
1844       case Op_VecS: return new vecSOper();
1845       case Op_VecD: return new vecDOper();
1846       case Op_VecX: return new vecXOper();
1847       case Op_VecY: return new vecYOper();
1848       case Op_VecZ: return new vecZOper();
1849     }
1850   }
1851   ShouldNotReachHere();
1852   return NULL;
1853 }
1854 
1855 bool Matcher::is_reg2reg_move(MachNode* m) {
1856   switch (m->rule()) {
1857     case MoveVec2Leg_rule:
1858     case MoveLeg2Vec_rule:
1859     case MoveF2VL_rule:
1860     case MoveF2LEG_rule:
1861     case MoveVL2F_rule:
1862     case MoveLEG2F_rule:
1863     case MoveD2VL_rule:
1864     case MoveD2LEG_rule:
1865     case MoveVL2D_rule:
1866     case MoveLEG2D_rule:
1867       return true;
1868     default:
1869       return false;
1870   }
1871 }
1872 
1873 bool Matcher::is_generic_vector(MachOper* opnd) {
1874   switch (opnd->opcode()) {
1875     case VEC:
1876     case LEGVEC:
1877       return true;
1878     default:
1879       return false;
1880   }
1881 }
1882 
1883 //------------------------------------------------------------------------
1884 
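     // Vector mask predicates are allocated from the VECTMASK register class and
     // are typed as boolean vectors (TypeVectMask) of the requested length.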
1885 const RegMask* Matcher::predicate_reg_mask(void) {
1886   return &_VECTMASK_REG_mask;
1887 }
1888 
1889 const TypeVect* Matcher::predicate_reg_type(const Type* elemTy, int length) {
1890   return new TypeVectMask(TypeInt::BOOL, length);
1891 }
1892 
1893 // Max vector size in bytes. 0 if not supported.
1894 const int Matcher::vector_width_in_bytes(BasicType bt) {
1895   assert(is_java_primitive(bt), "only primitive type vectors");
1896   if (UseSSE < 2) return 0;
1897   // SSE2 supports 128bit vectors for all types.
1898   // AVX2 supports 256bit vectors for all types.
1899   // EVEX (AVX512) supports 512bit vectors for all types.
1900   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1901   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1902   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1903     size = (UseAVX > 2) ? 64 : 32;
1904   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1905     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1906   // Use flag to limit vector size.
1907   size = MIN2(size,(int)MaxVectorSize);
1908   // Minimum 2 values in vector (or 4 for bytes).
1909   switch (bt) {
1910   case T_DOUBLE:
1911   case T_LONG:
1912     if (size < 16) return 0;
1913     break;
1914   case T_FLOAT:
1915   case T_INT:
1916     if (size < 8) return 0;
1917     break;
1918   case T_BOOLEAN:
1919     if (size < 4) return 0;
1920     break;
1921   case T_CHAR:
1922     if (size < 4) return 0;
1923     break;
1924   case T_BYTE:
1925     if (size < 4) return 0;
1926     break;
1927   case T_SHORT:
1928     if (size < 4) return 0;
1929     break;
1930   default:
1931     ShouldNotReachHere();
1932   }
1933   return size;
1934 }
1935 
1936 // Limits on vector size (number of elements) loaded into vector.
1937 const int Matcher::max_vector_size(const BasicType bt) {
1938   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1939 }
1940 const int Matcher::min_vector_size(const BasicType bt) {
1941   int max_size = max_vector_size(bt);
1942   // Min size which can be loaded into vector is 4 bytes.
1943   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1944   // Support for calling SVML routines with 1-element double64 vectors.
1945   if (bt == T_DOUBLE) {
1946     size = 1;
1947   }
1948   return MIN2(size,max_size);
1949 }
1950 
1951 const int Matcher::scalable_vector_reg_size(const BasicType bt) {
1952   return -1;
1953 }
1954 
1955 // Vector ideal reg corresponding to specified size in bytes
1956 const uint Matcher::vector_ideal_reg(int size) {
1957   assert(MaxVectorSize >= size, "");
1958   switch(size) {
1959     case  4: return Op_VecS;
1960     case  8: return Op_VecD;
1961     case 16: return Op_VecX;
1962     case 32: return Op_VecY;
1963     case 64: return Op_VecZ;
1964   }
1965   ShouldNotReachHere();
1966   return 0;
1967 }
1968 
1969 // Check for shift by small constant as well
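     // A shift amount in the range 0-3 corresponds to the scale factors 1/2/4/8
     // that x86 addressing modes can encode, so such shifts are cloned into the
     // address expression instead of being computed into a separate register.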
1970 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1971   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1972       shift->in(2)->get_int() <= 3 &&
1973       // Are there other uses besides address expressions?
1974       !matcher->is_visited(shift)) {
1975     address_visited.set(shift->_idx); // Flag as address_visited
1976     mstack.push(shift->in(2), Matcher::Visit);
1977     Node *conv = shift->in(1);
1978 #ifdef _LP64
1979     // Allow the Matcher to match the rule which bypasses the
1980     // ConvI2L operation for an array index on LP64
1981     // if the index value is known to be non-negative.
1982     if (conv->Opcode() == Op_ConvI2L &&
1983         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1984         // Are there other uses besides address expressions?
1985         !matcher->is_visited(conv)) {
1986       address_visited.set(conv->_idx); // Flag as address_visited
1987       mstack.push(conv->in(1), Matcher::Pre_Visit);
1988     } else
1989 #endif
1990       mstack.push(conv, Matcher::Pre_Visit);
1991     return true;
1992   }
1993   return false;
1994 }
1995 
1996 // The matcher below identifies sub-graphs in which a 'load' node is
1997 // input to two different nodes, such that the sub-graph can be matched
1998 // with BMI instructions like blsi, blsr, etc.
1999 // Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
2000 // The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
2001 // refers to the same node.
2002 //
2003 // Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
2004 // This is a temporary solution until we make DAGs expressible in ADL.
2005 template<typename ConType>
2006 class FusedPatternMatcher {
2007   Node* _op1_node;
2008   Node* _mop_node;
2009   int _con_op;
2010 
2011   static int match_next(Node* n, int next_op, int next_op_idx) {
2012     if (n->in(1) == NULL || n->in(2) == NULL) {
2013       return -1;
2014     }
2015 
2016     if (next_op_idx == -1) { // n is commutative, try rotations
2017       if (n->in(1)->Opcode() == next_op) {
2018         return 1;
2019       } else if (n->in(2)->Opcode() == next_op) {
2020         return 2;
2021       }
2022     } else {
2023       assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
2024       if (n->in(next_op_idx)->Opcode() == next_op) {
2025         return next_op_idx;
2026       }
2027     }
2028     return -1;
2029   }
2030 
2031  public:
2032   FusedPatternMatcher(Node* op1_node, Node* mop_node, int con_op) :
2033     _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
2034 
2035   bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
2036              int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
2037              typename ConType::NativeType con_value) {
2038     if (_op1_node->Opcode() != op1) {
2039       return false;
2040     }
2041     if (_mop_node->outcnt() > 2) {
2042       return false;
2043     }
2044     op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);
2045     if (op1_op2_idx == -1) {
2046       return false;
2047     }
2048     // Memory operation must be the other edge
2049     int op1_mop_idx = (op1_op2_idx & 1) + 1;
2050 
2051     // Check that the mop node is really what we want
2052     if (_op1_node->in(op1_mop_idx) == _mop_node) {
2053       Node* op2_node = _op1_node->in(op1_op2_idx);
2054       if (op2_node->outcnt() > 1) {
2055         return false;
2056       }
2057       assert(op2_node->Opcode() == op2, "Should be");
2058       op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);
2059       if (op2_con_idx == -1) {
2060         return false;
2061       }
2062       // Memory operation must be the other edge
2063       int op2_mop_idx = (op2_con_idx & 1) + 1;
2064       // Check that the memory operation is the same node
2065       if (op2_node->in(op2_mop_idx) == _mop_node) {
2066         // Now check the constant
2067         const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
2068         if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {
2069           return true;
2070         }
2071       }
2072     }
2073     return false;
2074   }
2075 };
2076 
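     // is_bmi_pattern() checks whether 'n' with memory input 'm' forms one of the
     // fusible shapes, e.g. bmii.match(Op_AndI, -1, Op_SubI, 1, 0) accepts
     // (AndI (SubI 0 LoadI) LoadI), i.e. x & -x (blsi), while the AddI variants
     // with constant -1 cover the x & (x-1) and x ^ (x-1) shapes (blsr, blsmsk).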
2077 static bool is_bmi_pattern(Node* n, Node* m) {
2078   assert(UseBMI1Instructions, "sanity");
2079   if (n != NULL && m != NULL) {
2080     if (m->Opcode() == Op_LoadI) {
2081       FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
2082       return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||
2083              bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||
2084              bmii.match(Op_XorI, -1, Op_AddI, -1, -1);
2085     } else if (m->Opcode() == Op_LoadL) {
2086       FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
2087       return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||
2088              bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||
2089              bmil.match(Op_XorL, -1, Op_AddL, -1, -1);
2090     }
2091   }
2092   return false;
2093 }
2094 
2095 // Should the matcher clone input 'm' of node 'n'?
2096 bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) {
2097   // If 'n' and 'm' are part of a graph for BMI instruction, clone the input 'm'.
2098   if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
2099     mstack.push(m, Visit);
2100     return true;
2101   }
2102   if (is_vshift_con_pattern(n, m)) { // ShiftV src (ShiftCntV con)
2103     mstack.push(m, Visit);           // m = ShiftCntV
2104     return true;
2105   }
2106   return false;
2107 }
2108 
2109 // Should the Matcher clone shifts on addressing modes, expecting them
2110 // to be subsumed into complex addressing expressions or compute them
2111 // into registers?
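     // For example, a (base + (index << 2) + constant) computation can then be
     // matched as a single [base + index*4 + disp] operand.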
2112 bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
2113   Node *off = m->in(AddPNode::Offset);
2114   if (off->is_Con()) {
2115     address_visited.test_set(m->_idx); // Flag as address_visited
2116     Node *adr = m->in(AddPNode::Address);
2117 
2118     // Intel can handle 2 adds in addressing mode
2119     // AtomicAdd is not an addressing expression.
2120     // Cheap to find it by looking for screwy base.
2121     if (adr->is_AddP() &&
2122         !adr->in(AddPNode::Base)->is_top() &&
2123         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
2124         // Are there other uses besides address expressions?
2125         !is_visited(adr)) {
2126       address_visited.set(adr->_idx); // Flag as address_visited
2127       Node *shift = adr->in(AddPNode::Offset);
2128       if (!clone_shift(shift, this, mstack, address_visited)) {
2129         mstack.push(shift, Pre_Visit);
2130       }
2131       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
2132       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
2133     } else {
2134       mstack.push(adr, Pre_Visit);
2135     }
2136 
2137     // Clone X+offset as it also folds into most addressing expressions
2138     mstack.push(off, Visit);
2139     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2140     return true;
2141   } else if (clone_shift(off, this, mstack, address_visited)) {
2142     address_visited.test_set(m->_idx); // Flag as address_visited
2143     mstack.push(m->in(AddPNode::Address), Pre_Visit);
2144     mstack.push(m->in(AddPNode::Base), Pre_Visit);
2145     return true;
2146   }
2147   return false;
2148 }
2149 
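     // Translate a BoolTest relation into the assembler's integer comparison
     // predicate; the signed and unsigned forms of each relation map to the same
     // predicate encoding here.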
2150 static inline Assembler::ComparisonPredicate booltest_pred_to_comparison_pred(int bt) {
2151   switch (bt) {
2152     case BoolTest::eq:
2153       return Assembler::eq;
2154     case BoolTest::ne:
2155       return Assembler::neq;
2156     case BoolTest::le:
2157     case BoolTest::ule:
2158       return Assembler::le;
2159     case BoolTest::ge:
2160     case BoolTest::uge:
2161       return Assembler::nlt;
2162     case BoolTest::lt:
2163     case BoolTest::ult:
2164       return Assembler::lt;
2165     case BoolTest::gt:
2166     case BoolTest::ugt:
2167       return Assembler::nle;
2168     default : ShouldNotReachHere(); return Assembler::_false;
2169   }
2170 }
2171 
2172 static inline Assembler::ComparisonPredicateFP booltest_pred_to_comparison_pred_fp(int bt) {
2173   switch (bt) {
2174   case BoolTest::eq: return Assembler::EQ_OQ;  // ordered non-signaling
2175   // As per JLS 15.21.1, != of NaNs is true. Thus use unordered compare.
2176   case BoolTest::ne: return Assembler::NEQ_UQ; // unordered non-signaling
2177   case BoolTest::le: return Assembler::LE_OQ;  // ordered non-signaling
2178   case BoolTest::ge: return Assembler::GE_OQ;  // ordered non-signaling
2179   case BoolTest::lt: return Assembler::LT_OQ;  // ordered non-signaling
2180   case BoolTest::gt: return Assembler::GT_OQ;  // ordered non-signaling
2181   default: ShouldNotReachHere(); return Assembler::FALSE_OS;
2182   }
2183 }
2184 
2185 // Helper methods for MachSpillCopyNode::implementation().
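     // When 'cbuf' is NULL these helpers emit nothing and only print the
     // instruction for debug output (the #ifndef PRODUCT branches below).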
2186 static void vec_mov_helper(CodeBuffer *cbuf, int src_lo, int dst_lo,
2187                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
2188   assert(ireg == Op_VecS || // 32bit vector
2189          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
2190          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
2191          "no non-adjacent vector moves" );
2192   if (cbuf) {
2193     C2_MacroAssembler _masm(cbuf);
2194     switch (ireg) {
2195     case Op_VecS: // copy whole register
2196     case Op_VecD:
2197     case Op_VecX:
2198 #ifndef _LP64
2199       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2200 #else
2201       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2202         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2203       } else {
2204         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2205       }
2206 #endif
2207       break;
2208     case Op_VecY:
2209 #ifndef _LP64
2210       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2211 #else
2212       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2213         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
2214       } else {
2215         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
2216       }
2217 #endif
2218       break;
2219     case Op_VecZ:
2220       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
2221       break;
2222     default:
2223       ShouldNotReachHere();
2224     }
2225 #ifndef PRODUCT
2226   } else {
2227     switch (ireg) {
2228     case Op_VecS:
2229     case Op_VecD:
2230     case Op_VecX:
2231       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2232       break;
2233     case Op_VecY:
2234     case Op_VecZ:
2235       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
2236       break;
2237     default:
2238       ShouldNotReachHere();
2239     }
2240 #endif
2241   }
2242 }
2243 
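     // Load or store a vector register from/to the stack slot at 'stack_offset';
     // as above, a NULL 'cbuf' only produces the printed form.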
2244 void vec_spill_helper(CodeBuffer *cbuf, bool is_load,
2245                      int stack_offset, int reg, uint ireg, outputStream* st) {
2246   if (cbuf) {
2247     C2_MacroAssembler _masm(cbuf);
2248     if (is_load) {
2249       switch (ireg) {
2250       case Op_VecS:
2251         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2252         break;
2253       case Op_VecD:
2254         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2255         break;
2256       case Op_VecX:
2257 #ifndef _LP64
2258         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2259 #else
2260         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2261           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2262         } else {
2263           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2265         }
2266 #endif
2267         break;
2268       case Op_VecY:
2269 #ifndef _LP64
2270         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2271 #else
2272         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2273           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
2274         } else {
2275           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
          __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 0x0);
2277         }
2278 #endif
2279         break;
2280       case Op_VecZ:
2281         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
2282         break;
2283       default:
2284         ShouldNotReachHere();
2285       }
2286     } else { // store
2287       switch (ireg) {
2288       case Op_VecS:
2289         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2290         break;
2291       case Op_VecD:
2292         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2293         break;
2294       case Op_VecX:
2295 #ifndef _LP64
2296         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2297 #else
2298         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2299           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2300         }
2301         else {
2302           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2303         }
2304 #endif
2305         break;
2306       case Op_VecY:
2307 #ifndef _LP64
2308         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2309 #else
2310         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
2311           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
2312         }
2313         else {
2314           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
2315         }
2316 #endif
2317         break;
2318       case Op_VecZ:
2319         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
2320         break;
2321       default:
2322         ShouldNotReachHere();
2323       }
2324     }
2325 #ifndef PRODUCT
2326   } else {
2327     if (is_load) {
2328       switch (ireg) {
2329       case Op_VecS:
2330         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2331         break;
2332       case Op_VecD:
2333         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2334         break;
      case Op_VecX:
2336         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2337         break;
2338       case Op_VecY:
2339       case Op_VecZ:
2340         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
2341         break;
2342       default:
2343         ShouldNotReachHere();
2344       }
2345     } else { // store
2346       switch (ireg) {
2347       case Op_VecS:
2348         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2349         break;
2350       case Op_VecD:
2351         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2352         break;
      case Op_VecX:
2354         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2355         break;
2356       case Op_VecY:
2357       case Op_VecZ:
2358         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
2359         break;
2360       default:
2361         ShouldNotReachHere();
2362       }
2363     }
2364 #endif
2365   }
2366 }
2367 
2368 static inline jlong replicate8_imm(int con, int width) {
  // Load a constant of "width" (in bytes) and replicate it to fill 64 bits.
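  // For example (values chosen purely for illustration):
  //   replicate8_imm(0x1A, 1) == 0x1A1A1A1A1A1A1A1A
  //   replicate8_imm(0x1A, 2) == 0x001A001A001A001A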
2370   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
2371   int bit_width = width * 8;
2372   jlong val = con;
2373   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
  while (bit_width < 64) {
2375     val |= (val << bit_width);
2376     bit_width <<= 1;
2377   }
2378   return val;
2379 }
2380 
2381 #ifndef PRODUCT
2382   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
2383     st->print("nop \t# %d bytes pad for loops and calls", _count);
2384   }
2385 #endif
2386 
2387   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
2388     C2_MacroAssembler _masm(&cbuf);
2389     __ nop(_count);
2390   }
2391 
2392   uint MachNopNode::size(PhaseRegAlloc*) const {
2393     return _count;
2394   }
2395 
2396 #ifndef PRODUCT
2397   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2398     st->print("# breakpoint");
2399   }
2400 #endif
2401 
2402   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2403     C2_MacroAssembler _masm(&cbuf);
2404     __ int3();
2405   }
2406 
2407   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2408     return MachNode::size(ra_);
2409   }
2410 
2411 %}
2412 
2413 encode %{
2414 
2415   enc_class call_epilog %{
2416     C2_MacroAssembler _masm(&cbuf);
2417     if (VerifyStackAtCalls) {
2418       // Check that stack depth is unchanged: find majik cookie on stack
2419       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2420       Label L;
2421       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2422       __ jccb(Assembler::equal, L);
2423       // Die if stack mismatch
2424       __ int3();
2425       __ bind(L);
2426     }
2427     __ oopmap_metadata(-1);
2428   %}
2429 
2430 %}
2431 
// Operands for bound floating-point register arguments
2433 operand rxmm0() %{
2434   constraint(ALLOC_IN_RC(xmm0_reg));
2435   match(VecX);
  format %{ %}
2437   interface(REG_INTER);
2438 %}
2439 
2440 //----------OPERANDS-----------------------------------------------------------
2441 // Operand definitions must precede instruction definitions for correct parsing
// in the ADLC because operands constitute user-defined types which are used in
2443 // instruction definitions.
2444 
2445 // Vectors
2446 
2447 // Dummy generic vector class. Should be used for all vector operands.
// Replaced with vec[SDXYZ] during post-selection cleanup.
2449 operand vec() %{
2450   constraint(ALLOC_IN_RC(dynamic));
2451   match(VecX);
2452   match(VecY);
2453   match(VecZ);
2454   match(VecS);
2455   match(VecD);
2456 
2457   format %{ %}
2458   interface(REG_INTER);
2459 %}
2460 
2461 // Dummy generic legacy vector class. Should be used for all legacy vector operands.
2462 // Replaced with legVec[SDXYZ] during post-selection cleanup.
2463 // Note: legacy register class is used to avoid extra (unneeded in 32-bit VM)
2464 // runtime code generation via reg_class_dynamic.
2465 operand legVec() %{
2466   constraint(ALLOC_IN_RC(dynamic));
2467   match(VecX);
2468   match(VecY);
2469   match(VecZ);
2470   match(VecS);
2471   match(VecD);
2472 
2473   format %{ %}
2474   interface(REG_INTER);
2475 %}
2476 
2477 // Replaces vec during post-selection cleanup. See above.
2478 operand vecS() %{
2479   constraint(ALLOC_IN_RC(vectors_reg_vlbwdq));
2480   match(VecS);
2481 
2482   format %{ %}
2483   interface(REG_INTER);
2484 %}
2485 
2486 // Replaces legVec during post-selection cleanup. See above.
2487 operand legVecS() %{
2488   constraint(ALLOC_IN_RC(vectors_reg_legacy));
2489   match(VecS);
2490 
2491   format %{ %}
2492   interface(REG_INTER);
2493 %}
2494 
2495 // Replaces vec during post-selection cleanup. See above.
2496 operand vecD() %{
2497   constraint(ALLOC_IN_RC(vectord_reg_vlbwdq));
2498   match(VecD);
2499 
2500   format %{ %}
2501   interface(REG_INTER);
2502 %}
2503 
2504 // Replaces legVec during post-selection cleanup. See above.
2505 operand legVecD() %{
2506   constraint(ALLOC_IN_RC(vectord_reg_legacy));
2507   match(VecD);
2508 
2509   format %{ %}
2510   interface(REG_INTER);
2511 %}
2512 
2513 // Replaces vec during post-selection cleanup. See above.
2514 operand vecX() %{
2515   constraint(ALLOC_IN_RC(vectorx_reg_vlbwdq));
2516   match(VecX);
2517 
2518   format %{ %}
2519   interface(REG_INTER);
2520 %}
2521 
2522 // Replaces legVec during post-selection cleanup. See above.
2523 operand legVecX() %{
2524   constraint(ALLOC_IN_RC(vectorx_reg_legacy));
2525   match(VecX);
2526 
2527   format %{ %}
2528   interface(REG_INTER);
2529 %}
2530 
2531 // Replaces vec during post-selection cleanup. See above.
2532 operand vecY() %{
2533   constraint(ALLOC_IN_RC(vectory_reg_vlbwdq));
2534   match(VecY);
2535 
2536   format %{ %}
2537   interface(REG_INTER);
2538 %}
2539 
2540 // Replaces legVec during post-selection cleanup. See above.
2541 operand legVecY() %{
2542   constraint(ALLOC_IN_RC(vectory_reg_legacy));
2543   match(VecY);
2544 
2545   format %{ %}
2546   interface(REG_INTER);
2547 %}
2548 
2549 // Replaces vec during post-selection cleanup. See above.
2550 operand vecZ() %{
2551   constraint(ALLOC_IN_RC(vectorz_reg));
2552   match(VecZ);
2553 
2554   format %{ %}
2555   interface(REG_INTER);
2556 %}
2557 
2558 // Replaces legVec during post-selection cleanup. See above.
2559 operand legVecZ() %{
2560   constraint(ALLOC_IN_RC(vectorz_reg_legacy));
2561   match(VecZ);
2562 
2563   format %{ %}
2564   interface(REG_INTER);
2565 %}
2566 
2567 // Comparison Code for FP conditional move
2568 operand cmpOp_vcmppd() %{
2569   match(Bool);
2570 
2571   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2572             n->as_Bool()->_test._test != BoolTest::no_overflow);
2573   format %{ "" %}
2574   interface(COND_INTER) %{
2575     equal        (0x0, "eq");
2576     less         (0x1, "lt");
2577     less_equal   (0x2, "le");
2578     not_equal    (0xC, "ne");
2579     greater_equal(0xD, "ge");
2580     greater      (0xE, "gt");
    // TODO: adlc cannot compile this operand without the next two lines; it fails with:
2582     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2583     // equal' for overflow.
2584     overflow     (0x20, "o");  // not really supported by the instruction
2585     no_overflow  (0x21, "no"); // not really supported by the instruction
2586   %}
2587 %}
2588 
2589 
// INSTRUCTIONS -- Platform-independent definitions (same for 32- and 64-bit)
2591 
2592 // ============================================================================
2593 
2594 instruct ShouldNotReachHere() %{
2595   match(Halt);
2596   format %{ "stop\t# ShouldNotReachHere" %}
2597   ins_encode %{
2598     if (is_reachable()) {
2599       __ stop(_halt_reason);
2600     }
2601   %}
2602   ins_pipe(pipe_slow);
2603 %}
2604 
2605 // =================================EVEX special===============================
// The existing partial implementation of post-loop multi-versioning computes
// the mask corresponding to the tail loop in the K1 opmask register. This mask
// may then be used to predicate instructions in the loop body during the last
// post-loop iteration.
// TODO: Remove the hard-coded K1 usage while fixing the existing post-loop
// multi-versioning support.
2611 instruct setMask(rRegI dst, rRegI src, kReg_K1 mask) %{
2612   predicate(PostLoopMultiversioning && Matcher::has_predicated_vectors());
2613   match(Set dst (SetVectMaskI  src));
2614   effect(TEMP dst);
2615   format %{ "setvectmask   $dst, $src" %}
2616   ins_encode %{
2617     __ setvectmask($dst$$Register, $src$$Register, $mask$$KRegister);
2618   %}
2619   ins_pipe(pipe_slow);
2620 %}
2621 
2622 // ============================================================================
2623 
2624 instruct addF_reg(regF dst, regF src) %{
2625   predicate((UseSSE>=1) && (UseAVX == 0));
2626   match(Set dst (AddF dst src));
2627 
2628   format %{ "addss   $dst, $src" %}
2629   ins_cost(150);
2630   ins_encode %{
2631     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2632   %}
2633   ins_pipe(pipe_slow);
2634 %}
2635 
2636 instruct addF_mem(regF dst, memory src) %{
2637   predicate((UseSSE>=1) && (UseAVX == 0));
2638   match(Set dst (AddF dst (LoadF src)));
2639 
2640   format %{ "addss   $dst, $src" %}
2641   ins_cost(150);
2642   ins_encode %{
2643     __ addss($dst$$XMMRegister, $src$$Address);
2644   %}
2645   ins_pipe(pipe_slow);
2646 %}
2647 
2648 instruct addF_imm(regF dst, immF con) %{
2649   predicate((UseSSE>=1) && (UseAVX == 0));
2650   match(Set dst (AddF dst con));
2651   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2652   ins_cost(150);
2653   ins_encode %{
2654     __ addss($dst$$XMMRegister, $constantaddress($con));
2655   %}
2656   ins_pipe(pipe_slow);
2657 %}
2658 
2659 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2660   predicate(UseAVX > 0);
2661   match(Set dst (AddF src1 src2));
2662 
2663   format %{ "vaddss  $dst, $src1, $src2" %}
2664   ins_cost(150);
2665   ins_encode %{
2666     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2667   %}
2668   ins_pipe(pipe_slow);
2669 %}
2670 
2671 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2672   predicate(UseAVX > 0);
2673   match(Set dst (AddF src1 (LoadF src2)));
2674 
2675   format %{ "vaddss  $dst, $src1, $src2" %}
2676   ins_cost(150);
2677   ins_encode %{
2678     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2679   %}
2680   ins_pipe(pipe_slow);
2681 %}
2682 
2683 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2684   predicate(UseAVX > 0);
2685   match(Set dst (AddF src con));
2686 
2687   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2688   ins_cost(150);
2689   ins_encode %{
2690     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2691   %}
2692   ins_pipe(pipe_slow);
2693 %}
2694 
2695 instruct addD_reg(regD dst, regD src) %{
2696   predicate((UseSSE>=2) && (UseAVX == 0));
2697   match(Set dst (AddD dst src));
2698 
2699   format %{ "addsd   $dst, $src" %}
2700   ins_cost(150);
2701   ins_encode %{
2702     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2703   %}
2704   ins_pipe(pipe_slow);
2705 %}
2706 
2707 instruct addD_mem(regD dst, memory src) %{
2708   predicate((UseSSE>=2) && (UseAVX == 0));
2709   match(Set dst (AddD dst (LoadD src)));
2710 
2711   format %{ "addsd   $dst, $src" %}
2712   ins_cost(150);
2713   ins_encode %{
2714     __ addsd($dst$$XMMRegister, $src$$Address);
2715   %}
2716   ins_pipe(pipe_slow);
2717 %}
2718 
2719 instruct addD_imm(regD dst, immD con) %{
2720   predicate((UseSSE>=2) && (UseAVX == 0));
2721   match(Set dst (AddD dst con));
2722   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2723   ins_cost(150);
2724   ins_encode %{
2725     __ addsd($dst$$XMMRegister, $constantaddress($con));
2726   %}
2727   ins_pipe(pipe_slow);
2728 %}
2729 
2730 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2731   predicate(UseAVX > 0);
2732   match(Set dst (AddD src1 src2));
2733 
2734   format %{ "vaddsd  $dst, $src1, $src2" %}
2735   ins_cost(150);
2736   ins_encode %{
2737     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2738   %}
2739   ins_pipe(pipe_slow);
2740 %}
2741 
2742 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2743   predicate(UseAVX > 0);
2744   match(Set dst (AddD src1 (LoadD src2)));
2745 
2746   format %{ "vaddsd  $dst, $src1, $src2" %}
2747   ins_cost(150);
2748   ins_encode %{
2749     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2750   %}
2751   ins_pipe(pipe_slow);
2752 %}
2753 
2754 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2755   predicate(UseAVX > 0);
2756   match(Set dst (AddD src con));
2757 
2758   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2759   ins_cost(150);
2760   ins_encode %{
2761     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2762   %}
2763   ins_pipe(pipe_slow);
2764 %}
2765 
2766 instruct subF_reg(regF dst, regF src) %{
2767   predicate((UseSSE>=1) && (UseAVX == 0));
2768   match(Set dst (SubF dst src));
2769 
2770   format %{ "subss   $dst, $src" %}
2771   ins_cost(150);
2772   ins_encode %{
2773     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2774   %}
2775   ins_pipe(pipe_slow);
2776 %}
2777 
2778 instruct subF_mem(regF dst, memory src) %{
2779   predicate((UseSSE>=1) && (UseAVX == 0));
2780   match(Set dst (SubF dst (LoadF src)));
2781 
2782   format %{ "subss   $dst, $src" %}
2783   ins_cost(150);
2784   ins_encode %{
2785     __ subss($dst$$XMMRegister, $src$$Address);
2786   %}
2787   ins_pipe(pipe_slow);
2788 %}
2789 
2790 instruct subF_imm(regF dst, immF con) %{
2791   predicate((UseSSE>=1) && (UseAVX == 0));
2792   match(Set dst (SubF dst con));
2793   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2794   ins_cost(150);
2795   ins_encode %{
2796     __ subss($dst$$XMMRegister, $constantaddress($con));
2797   %}
2798   ins_pipe(pipe_slow);
2799 %}
2800 
2801 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2802   predicate(UseAVX > 0);
2803   match(Set dst (SubF src1 src2));
2804 
2805   format %{ "vsubss  $dst, $src1, $src2" %}
2806   ins_cost(150);
2807   ins_encode %{
2808     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2809   %}
2810   ins_pipe(pipe_slow);
2811 %}
2812 
2813 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2814   predicate(UseAVX > 0);
2815   match(Set dst (SubF src1 (LoadF src2)));
2816 
2817   format %{ "vsubss  $dst, $src1, $src2" %}
2818   ins_cost(150);
2819   ins_encode %{
2820     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2821   %}
2822   ins_pipe(pipe_slow);
2823 %}
2824 
2825 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2826   predicate(UseAVX > 0);
2827   match(Set dst (SubF src con));
2828 
2829   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2830   ins_cost(150);
2831   ins_encode %{
2832     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2833   %}
2834   ins_pipe(pipe_slow);
2835 %}
2836 
2837 instruct subD_reg(regD dst, regD src) %{
2838   predicate((UseSSE>=2) && (UseAVX == 0));
2839   match(Set dst (SubD dst src));
2840 
2841   format %{ "subsd   $dst, $src" %}
2842   ins_cost(150);
2843   ins_encode %{
2844     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2845   %}
2846   ins_pipe(pipe_slow);
2847 %}
2848 
2849 instruct subD_mem(regD dst, memory src) %{
2850   predicate((UseSSE>=2) && (UseAVX == 0));
2851   match(Set dst (SubD dst (LoadD src)));
2852 
2853   format %{ "subsd   $dst, $src" %}
2854   ins_cost(150);
2855   ins_encode %{
2856     __ subsd($dst$$XMMRegister, $src$$Address);
2857   %}
2858   ins_pipe(pipe_slow);
2859 %}
2860 
2861 instruct subD_imm(regD dst, immD con) %{
2862   predicate((UseSSE>=2) && (UseAVX == 0));
2863   match(Set dst (SubD dst con));
2864   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2865   ins_cost(150);
2866   ins_encode %{
2867     __ subsd($dst$$XMMRegister, $constantaddress($con));
2868   %}
2869   ins_pipe(pipe_slow);
2870 %}
2871 
2872 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2873   predicate(UseAVX > 0);
2874   match(Set dst (SubD src1 src2));
2875 
2876   format %{ "vsubsd  $dst, $src1, $src2" %}
2877   ins_cost(150);
2878   ins_encode %{
2879     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2880   %}
2881   ins_pipe(pipe_slow);
2882 %}
2883 
2884 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2885   predicate(UseAVX > 0);
2886   match(Set dst (SubD src1 (LoadD src2)));
2887 
2888   format %{ "vsubsd  $dst, $src1, $src2" %}
2889   ins_cost(150);
2890   ins_encode %{
2891     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2892   %}
2893   ins_pipe(pipe_slow);
2894 %}
2895 
2896 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2897   predicate(UseAVX > 0);
2898   match(Set dst (SubD src con));
2899 
2900   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2901   ins_cost(150);
2902   ins_encode %{
2903     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2904   %}
2905   ins_pipe(pipe_slow);
2906 %}
2907 
2908 instruct mulF_reg(regF dst, regF src) %{
2909   predicate((UseSSE>=1) && (UseAVX == 0));
2910   match(Set dst (MulF dst src));
2911 
2912   format %{ "mulss   $dst, $src" %}
2913   ins_cost(150);
2914   ins_encode %{
2915     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2916   %}
2917   ins_pipe(pipe_slow);
2918 %}
2919 
2920 instruct mulF_mem(regF dst, memory src) %{
2921   predicate((UseSSE>=1) && (UseAVX == 0));
2922   match(Set dst (MulF dst (LoadF src)));
2923 
2924   format %{ "mulss   $dst, $src" %}
2925   ins_cost(150);
2926   ins_encode %{
2927     __ mulss($dst$$XMMRegister, $src$$Address);
2928   %}
2929   ins_pipe(pipe_slow);
2930 %}
2931 
2932 instruct mulF_imm(regF dst, immF con) %{
2933   predicate((UseSSE>=1) && (UseAVX == 0));
2934   match(Set dst (MulF dst con));
2935   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2936   ins_cost(150);
2937   ins_encode %{
2938     __ mulss($dst$$XMMRegister, $constantaddress($con));
2939   %}
2940   ins_pipe(pipe_slow);
2941 %}
2942 
2943 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2944   predicate(UseAVX > 0);
2945   match(Set dst (MulF src1 src2));
2946 
2947   format %{ "vmulss  $dst, $src1, $src2" %}
2948   ins_cost(150);
2949   ins_encode %{
2950     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2951   %}
2952   ins_pipe(pipe_slow);
2953 %}
2954 
2955 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2956   predicate(UseAVX > 0);
2957   match(Set dst (MulF src1 (LoadF src2)));
2958 
2959   format %{ "vmulss  $dst, $src1, $src2" %}
2960   ins_cost(150);
2961   ins_encode %{
2962     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2963   %}
2964   ins_pipe(pipe_slow);
2965 %}
2966 
2967 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2968   predicate(UseAVX > 0);
2969   match(Set dst (MulF src con));
2970 
2971   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2972   ins_cost(150);
2973   ins_encode %{
2974     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2975   %}
2976   ins_pipe(pipe_slow);
2977 %}
2978 
2979 instruct mulD_reg(regD dst, regD src) %{
2980   predicate((UseSSE>=2) && (UseAVX == 0));
2981   match(Set dst (MulD dst src));
2982 
2983   format %{ "mulsd   $dst, $src" %}
2984   ins_cost(150);
2985   ins_encode %{
2986     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2987   %}
2988   ins_pipe(pipe_slow);
2989 %}
2990 
2991 instruct mulD_mem(regD dst, memory src) %{
2992   predicate((UseSSE>=2) && (UseAVX == 0));
2993   match(Set dst (MulD dst (LoadD src)));
2994 
2995   format %{ "mulsd   $dst, $src" %}
2996   ins_cost(150);
2997   ins_encode %{
2998     __ mulsd($dst$$XMMRegister, $src$$Address);
2999   %}
3000   ins_pipe(pipe_slow);
3001 %}
3002 
3003 instruct mulD_imm(regD dst, immD con) %{
3004   predicate((UseSSE>=2) && (UseAVX == 0));
3005   match(Set dst (MulD dst con));
3006   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3007   ins_cost(150);
3008   ins_encode %{
3009     __ mulsd($dst$$XMMRegister, $constantaddress($con));
3010   %}
3011   ins_pipe(pipe_slow);
3012 %}
3013 
3014 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
3015   predicate(UseAVX > 0);
3016   match(Set dst (MulD src1 src2));
3017 
3018   format %{ "vmulsd  $dst, $src1, $src2" %}
3019   ins_cost(150);
3020   ins_encode %{
3021     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3022   %}
3023   ins_pipe(pipe_slow);
3024 %}
3025 
3026 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
3027   predicate(UseAVX > 0);
3028   match(Set dst (MulD src1 (LoadD src2)));
3029 
3030   format %{ "vmulsd  $dst, $src1, $src2" %}
3031   ins_cost(150);
3032   ins_encode %{
3033     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3034   %}
3035   ins_pipe(pipe_slow);
3036 %}
3037 
3038 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
3039   predicate(UseAVX > 0);
3040   match(Set dst (MulD src con));
3041 
3042   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3043   ins_cost(150);
3044   ins_encode %{
3045     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3046   %}
3047   ins_pipe(pipe_slow);
3048 %}
3049 
3050 instruct divF_reg(regF dst, regF src) %{
3051   predicate((UseSSE>=1) && (UseAVX == 0));
3052   match(Set dst (DivF dst src));
3053 
3054   format %{ "divss   $dst, $src" %}
3055   ins_cost(150);
3056   ins_encode %{
3057     __ divss($dst$$XMMRegister, $src$$XMMRegister);
3058   %}
3059   ins_pipe(pipe_slow);
3060 %}
3061 
3062 instruct divF_mem(regF dst, memory src) %{
3063   predicate((UseSSE>=1) && (UseAVX == 0));
3064   match(Set dst (DivF dst (LoadF src)));
3065 
3066   format %{ "divss   $dst, $src" %}
3067   ins_cost(150);
3068   ins_encode %{
3069     __ divss($dst$$XMMRegister, $src$$Address);
3070   %}
3071   ins_pipe(pipe_slow);
3072 %}
3073 
3074 instruct divF_imm(regF dst, immF con) %{
3075   predicate((UseSSE>=1) && (UseAVX == 0));
3076   match(Set dst (DivF dst con));
3077   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
3078   ins_cost(150);
3079   ins_encode %{
3080     __ divss($dst$$XMMRegister, $constantaddress($con));
3081   %}
3082   ins_pipe(pipe_slow);
3083 %}
3084 
3085 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
3086   predicate(UseAVX > 0);
3087   match(Set dst (DivF src1 src2));
3088 
3089   format %{ "vdivss  $dst, $src1, $src2" %}
3090   ins_cost(150);
3091   ins_encode %{
3092     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3093   %}
3094   ins_pipe(pipe_slow);
3095 %}
3096 
3097 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
3098   predicate(UseAVX > 0);
3099   match(Set dst (DivF src1 (LoadF src2)));
3100 
3101   format %{ "vdivss  $dst, $src1, $src2" %}
3102   ins_cost(150);
3103   ins_encode %{
3104     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3105   %}
3106   ins_pipe(pipe_slow);
3107 %}
3108 
3109 instruct divF_reg_imm(regF dst, regF src, immF con) %{
3110   predicate(UseAVX > 0);
3111   match(Set dst (DivF src con));
3112 
3113   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
3114   ins_cost(150);
3115   ins_encode %{
3116     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3117   %}
3118   ins_pipe(pipe_slow);
3119 %}
3120 
3121 instruct divD_reg(regD dst, regD src) %{
3122   predicate((UseSSE>=2) && (UseAVX == 0));
3123   match(Set dst (DivD dst src));
3124 
3125   format %{ "divsd   $dst, $src" %}
3126   ins_cost(150);
3127   ins_encode %{
3128     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
3129   %}
3130   ins_pipe(pipe_slow);
3131 %}
3132 
3133 instruct divD_mem(regD dst, memory src) %{
3134   predicate((UseSSE>=2) && (UseAVX == 0));
3135   match(Set dst (DivD dst (LoadD src)));
3136 
3137   format %{ "divsd   $dst, $src" %}
3138   ins_cost(150);
3139   ins_encode %{
3140     __ divsd($dst$$XMMRegister, $src$$Address);
3141   %}
3142   ins_pipe(pipe_slow);
3143 %}
3144 
3145 instruct divD_imm(regD dst, immD con) %{
3146   predicate((UseSSE>=2) && (UseAVX == 0));
3147   match(Set dst (DivD dst con));
3148   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
3149   ins_cost(150);
3150   ins_encode %{
3151     __ divsd($dst$$XMMRegister, $constantaddress($con));
3152   %}
3153   ins_pipe(pipe_slow);
3154 %}
3155 
3156 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
3157   predicate(UseAVX > 0);
3158   match(Set dst (DivD src1 src2));
3159 
3160   format %{ "vdivsd  $dst, $src1, $src2" %}
3161   ins_cost(150);
3162   ins_encode %{
3163     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
3164   %}
3165   ins_pipe(pipe_slow);
3166 %}
3167 
3168 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
3169   predicate(UseAVX > 0);
3170   match(Set dst (DivD src1 (LoadD src2)));
3171 
3172   format %{ "vdivsd  $dst, $src1, $src2" %}
3173   ins_cost(150);
3174   ins_encode %{
3175     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
3176   %}
3177   ins_pipe(pipe_slow);
3178 %}
3179 
3180 instruct divD_reg_imm(regD dst, regD src, immD con) %{
3181   predicate(UseAVX > 0);
3182   match(Set dst (DivD src con));
3183 
3184   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
3185   ins_cost(150);
3186   ins_encode %{
3187     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
3188   %}
3189   ins_pipe(pipe_slow);
3190 %}
3191 
3192 instruct absF_reg(regF dst) %{
3193   predicate((UseSSE>=1) && (UseAVX == 0));
3194   match(Set dst (AbsF dst));
3195   ins_cost(150);
3196   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
3197   ins_encode %{
3198     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
3199   %}
3200   ins_pipe(pipe_slow);
3201 %}
3202 
3203 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
3204   predicate(UseAVX > 0);
3205   match(Set dst (AbsF src));
3206   ins_cost(150);
3207   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
3208   ins_encode %{
3209     int vlen_enc = Assembler::AVX_128bit;
3210     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
3211               ExternalAddress(float_signmask()), vlen_enc);
3212   %}
3213   ins_pipe(pipe_slow);
3214 %}
3215 
3216 instruct absD_reg(regD dst) %{
3217   predicate((UseSSE>=2) && (UseAVX == 0));
3218   match(Set dst (AbsD dst));
3219   ins_cost(150);
3220   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
3221             "# abs double by sign masking" %}
3222   ins_encode %{
3223     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
3224   %}
3225   ins_pipe(pipe_slow);
3226 %}
3227 
3228 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
3229   predicate(UseAVX > 0);
3230   match(Set dst (AbsD src));
3231   ins_cost(150);
3232   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
3233             "# abs double by sign masking" %}
3234   ins_encode %{
3235     int vlen_enc = Assembler::AVX_128bit;
3236     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
3237               ExternalAddress(double_signmask()), vlen_enc);
3238   %}
3239   ins_pipe(pipe_slow);
3240 %}
3241 
3242 instruct negF_reg(regF dst) %{
3243   predicate((UseSSE>=1) && (UseAVX == 0));
3244   match(Set dst (NegF dst));
3245   ins_cost(150);
3246   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
3247   ins_encode %{
3248     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
3249   %}
3250   ins_pipe(pipe_slow);
3251 %}
3252 
3253 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
3254   predicate(UseAVX > 0);
3255   match(Set dst (NegF src));
3256   ins_cost(150);
3257   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
3258   ins_encode %{
3259     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
3260                  ExternalAddress(float_signflip()));
3261   %}
3262   ins_pipe(pipe_slow);
3263 %}
3264 
3265 instruct negD_reg(regD dst) %{
3266   predicate((UseSSE>=2) && (UseAVX == 0));
3267   match(Set dst (NegD dst));
3268   ins_cost(150);
3269   format %{ "xorpd   $dst, [0x8000000000000000]\t"
3270             "# neg double by sign flipping" %}
3271   ins_encode %{
3272     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
3273   %}
3274   ins_pipe(pipe_slow);
3275 %}
3276 
3277 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
3278   predicate(UseAVX > 0);
3279   match(Set dst (NegD src));
3280   ins_cost(150);
3281   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
3282             "# neg double by sign flipping" %}
3283   ins_encode %{
3284     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
3285                  ExternalAddress(double_signflip()));
3286   %}
3287   ins_pipe(pipe_slow);
3288 %}
3289 
// The sqrtss instruction needs its destination register to be pre-initialized for best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
3292 instruct sqrtF_reg(regF dst) %{
3293   predicate(UseSSE>=1);
3294   match(Set dst (SqrtF dst));
3295   format %{ "sqrtss  $dst, $dst" %}
3296   ins_encode %{
3297     __ sqrtss($dst$$XMMRegister, $dst$$XMMRegister);
3298   %}
3299   ins_pipe(pipe_slow);
3300 %}
3301 
// The sqrtsd instruction needs its destination register to be pre-initialized for best performance.
// Therefore only the instruct rule where the input is pre-loaded into the dst register is defined below.
3304 instruct sqrtD_reg(regD dst) %{
3305   predicate(UseSSE>=2);
3306   match(Set dst (SqrtD dst));
3307   format %{ "sqrtsd  $dst, $dst" %}
3308   ins_encode %{
3309     __ sqrtsd($dst$$XMMRegister, $dst$$XMMRegister);
3310   %}
3311   ins_pipe(pipe_slow);
3312 %}
3313 
3314 // ---------------------------------------- VectorReinterpret ------------------------------------
3315 
3316 instruct reinterpret(vec dst) %{
3317   predicate(Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))); // dst == src
3318   match(Set dst (VectorReinterpret dst));
3319   ins_cost(125);
3320   format %{ "vector_reinterpret $dst\t!" %}
3321   ins_encode %{
3322     // empty
3323   %}
3324   ins_pipe( pipe_slow );
3325 %}
3326 
3327 instruct reinterpret_expand(vec dst, vec src, rRegP scratch) %{
3328   predicate(UseAVX == 0 &&
3329             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3330   match(Set dst (VectorReinterpret src));
3331   ins_cost(125);
3332   effect(TEMP dst, TEMP scratch);
3333   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3334   ins_encode %{
3335     assert(Matcher::vector_length_in_bytes(this)       <= 16, "required");
3336     assert(Matcher::vector_length_in_bytes(this, $src) <=  8, "required");
3337 
3338     int src_vlen_in_bytes = Matcher::vector_length_in_bytes(this, $src);
3339     if (src_vlen_in_bytes == 4) {
3340       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_32_bit_mask()), $scratch$$Register);
3341     } else {
3342       assert(src_vlen_in_bytes == 8, "");
3343       __ movdqu($dst$$XMMRegister, ExternalAddress(vector_64_bit_mask()), $scratch$$Register);
3344     }
3345     __ pand($dst$$XMMRegister, $src$$XMMRegister);
3346   %}
3347   ins_pipe( pipe_slow );
3348 %}
3349 
3350 instruct vreinterpret_expand4(legVec dst, vec src, rRegP scratch) %{
3351   predicate(UseAVX > 0 &&
3352             (Matcher::vector_length_in_bytes(n->in(1)) == 4) && // src
3353             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3354   match(Set dst (VectorReinterpret src));
3355   ins_cost(125);
3356   effect(TEMP scratch);
3357   format %{ "vector_reinterpret_expand $dst,$src\t! using $scratch as TEMP" %}
3358   ins_encode %{
3359     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_32_bit_mask()), 0, $scratch$$Register);
3360   %}
3361   ins_pipe( pipe_slow );
3362 %}
3363 
3364 
3365 instruct vreinterpret_expand(legVec dst, vec src) %{
3366   predicate(UseAVX > 0 &&
3367             (Matcher::vector_length_in_bytes(n->in(1)) > 4) && // src
3368             (Matcher::vector_length_in_bytes(n->in(1)) < Matcher::vector_length_in_bytes(n))); // src < dst
3369   match(Set dst (VectorReinterpret src));
3370   ins_cost(125);
3371   format %{ "vector_reinterpret_expand $dst,$src\t!" %}
3372   ins_encode %{
3373     switch (Matcher::vector_length_in_bytes(this, $src)) {
3374       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3375       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3376       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3377       default: ShouldNotReachHere();
3378     }
3379   %}
3380   ins_pipe( pipe_slow );
3381 %}
3382 
3383 instruct reinterpret_shrink(vec dst, legVec src) %{
3384   predicate(Matcher::vector_length_in_bytes(n->in(1)) > Matcher::vector_length_in_bytes(n)); // src > dst
3385   match(Set dst (VectorReinterpret src));
3386   ins_cost(125);
3387   format %{ "vector_reinterpret_shrink $dst,$src\t!" %}
3388   ins_encode %{
3389     switch (Matcher::vector_length_in_bytes(this)) {
3390       case  4: __ movfltz($dst$$XMMRegister, $src$$XMMRegister); break;
3391       case  8: __ movq   ($dst$$XMMRegister, $src$$XMMRegister); break;
3392       case 16: __ movdqu ($dst$$XMMRegister, $src$$XMMRegister); break;
3393       case 32: __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister); break;
3394       default: ShouldNotReachHere();
3395     }
3396   %}
3397   ins_pipe( pipe_slow );
3398 %}
3399 
3400 // ----------------------------------------------------------------------------------------------------
3401 
3402 #ifdef _LP64
3403 instruct roundD_reg(legRegD dst, legRegD src, immU8 rmode) %{
3404   match(Set dst (RoundDoubleMode src rmode));
3405   format %{ "roundsd $dst,$src" %}
3406   ins_cost(150);
3407   ins_encode %{
3408     assert(UseSSE >= 4, "required");
3409     __ roundsd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant);
3410   %}
3411   ins_pipe(pipe_slow);
3412 %}
3413 
3414 instruct roundD_mem(legRegD dst, memory src, immU8 rmode) %{
3415   match(Set dst (RoundDoubleMode (LoadD src) rmode));
3416   format %{ "roundsd $dst,$src" %}
3417   ins_cost(150);
3418   ins_encode %{
3419     assert(UseSSE >= 4, "required");
3420     __ roundsd($dst$$XMMRegister, $src$$Address, $rmode$$constant);
3421   %}
3422   ins_pipe(pipe_slow);
3423 %}
3424 
3425 instruct roundD_imm(legRegD dst, immD con, immU8 rmode, rRegI scratch_reg) %{
3426   match(Set dst (RoundDoubleMode con rmode));
3427   effect(TEMP scratch_reg);
3428   format %{ "roundsd $dst,[$constantaddress]\t# load from constant table: double=$con" %}
3429   ins_cost(150);
3430   ins_encode %{
3431     assert(UseSSE >= 4, "required");
3432     __ roundsd($dst$$XMMRegister, $constantaddress($con), $rmode$$constant, $scratch_reg$$Register);
3433   %}
3434   ins_pipe(pipe_slow);
3435 %}
3436 
3437 instruct vroundD_reg(legVec dst, legVec src, immU8 rmode) %{
3438   predicate(Matcher::vector_length(n) < 8);
3439   match(Set dst (RoundDoubleModeV src rmode));
3440   format %{ "vroundpd $dst,$src,$rmode\t! round packedD" %}
3441   ins_encode %{
3442     assert(UseAVX > 0, "required");
3443     int vlen_enc = vector_length_encoding(this);
3444     __ vroundpd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, vlen_enc);
3445   %}
3446   ins_pipe( pipe_slow );
3447 %}
3448 
3449 instruct vround8D_reg(vec dst, vec src, immU8 rmode) %{
3450   predicate(Matcher::vector_length(n) == 8);
3451   match(Set dst (RoundDoubleModeV src rmode));
3452   format %{ "vrndscalepd $dst,$src,$rmode\t! round packed8D" %}
3453   ins_encode %{
3454     assert(UseAVX > 2, "required");
3455     __ vrndscalepd($dst$$XMMRegister, $src$$XMMRegister, $rmode$$constant, Assembler::AVX_512bit);
3456   %}
3457   ins_pipe( pipe_slow );
3458 %}
3459 
3460 instruct vroundD_mem(legVec dst, memory mem, immU8 rmode) %{
3461   predicate(Matcher::vector_length(n) < 8);
3462   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3463   format %{ "vroundpd $dst, $mem, $rmode\t! round packedD" %}
3464   ins_encode %{
3465     assert(UseAVX > 0, "required");
3466     int vlen_enc = vector_length_encoding(this);
3467     __ vroundpd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, vlen_enc);
3468   %}
3469   ins_pipe( pipe_slow );
3470 %}
3471 
3472 instruct vround8D_mem(vec dst, memory mem, immU8 rmode) %{
3473   predicate(Matcher::vector_length(n) == 8);
3474   match(Set dst (RoundDoubleModeV (LoadVector mem) rmode));
3475   format %{ "vrndscalepd $dst,$mem,$rmode\t! round packed8D" %}
3476   ins_encode %{
3477     assert(UseAVX > 2, "required");
3478     __ vrndscalepd($dst$$XMMRegister, $mem$$Address, $rmode$$constant, Assembler::AVX_512bit);
3479   %}
3480   ins_pipe( pipe_slow );
3481 %}
3482 #endif // _LP64
3483 
3484 instruct onspinwait() %{
3485   match(OnSpinWait);
3486   ins_cost(200);
3487 
3488   format %{
3489     $$template
3490     $$emit$$"pause\t! membar_onspinwait"
3491   %}
3492   ins_encode %{
3493     __ pause();
3494   %}
3495   ins_pipe(pipe_slow);
3496 %}
3497 
3498 // a * b + c
3499 instruct fmaD_reg(regD a, regD b, regD c) %{
3500   predicate(UseFMA);
3501   match(Set c (FmaD  c (Binary a b)));
3502   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
3503   ins_cost(150);
3504   ins_encode %{
3505     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3506   %}
3507   ins_pipe( pipe_slow );
3508 %}
3509 
3510 // a * b + c
3511 instruct fmaF_reg(regF a, regF b, regF c) %{
3512   predicate(UseFMA);
3513   match(Set c (FmaF  c (Binary a b)));
3514   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
3515   ins_cost(150);
3516   ins_encode %{
3517     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
3518   %}
3519   ins_pipe( pipe_slow );
3520 %}
3521 
3522 // ====================VECTOR INSTRUCTIONS=====================================
3523 
3524 // Dummy reg-to-reg vector moves. Removed during post-selection cleanup.
3525 instruct MoveVec2Leg(legVec dst, vec src) %{
3526   match(Set dst src);
3527   format %{ "" %}
3528   ins_encode %{
3529     ShouldNotReachHere();
3530   %}
3531   ins_pipe( fpu_reg_reg );
3532 %}
3533 
3534 instruct MoveLeg2Vec(vec dst, legVec src) %{
3535   match(Set dst src);
3536   format %{ "" %}
3537   ins_encode %{
3538     ShouldNotReachHere();
3539   %}
3540   ins_pipe( fpu_reg_reg );
3541 %}
3542 
3543 // ============================================================================
3544 
// Generic load-vector operand pattern
3546 instruct loadV(vec dst, memory mem) %{
3547   match(Set dst (LoadVector mem));
3548   ins_cost(125);
3549   format %{ "load_vector $dst,$mem" %}
3550   ins_encode %{
3551     switch (Matcher::vector_length_in_bytes(this)) {
3552       case  4: __ movdl    ($dst$$XMMRegister, $mem$$Address); break;
3553       case  8: __ movq     ($dst$$XMMRegister, $mem$$Address); break;
3554       case 16: __ movdqu   ($dst$$XMMRegister, $mem$$Address); break;
3555       case 32: __ vmovdqu  ($dst$$XMMRegister, $mem$$Address); break;
3556       case 64: __ evmovdqul($dst$$XMMRegister, $mem$$Address, Assembler::AVX_512bit); break;
3557       default: ShouldNotReachHere();
3558     }
3559   %}
3560   ins_pipe( pipe_slow );
3561 %}
3562 
// Generic store-vector operand pattern.
3564 instruct storeV(memory mem, vec src) %{
3565   match(Set mem (StoreVector mem src));
3566   ins_cost(145);
3567   format %{ "store_vector $mem,$src\n\t" %}
3568   ins_encode %{
3569     switch (Matcher::vector_length_in_bytes(this, $src)) {
3570       case  4: __ movdl    ($mem$$Address, $src$$XMMRegister); break;
3571       case  8: __ movq     ($mem$$Address, $src$$XMMRegister); break;
3572       case 16: __ movdqu   ($mem$$Address, $src$$XMMRegister); break;
3573       case 32: __ vmovdqu  ($mem$$Address, $src$$XMMRegister); break;
3574       case 64: __ evmovdqul($mem$$Address, $src$$XMMRegister, Assembler::AVX_512bit); break;
3575       default: ShouldNotReachHere();
3576     }
3577   %}
3578   ins_pipe( pipe_slow );
3579 %}
3580 
3581 // ---------------------------------------- Gather ------------------------------------
3582 
3583 // Gather INT, LONG, FLOAT, DOUBLE
3584 
3585 instruct gather(legVec dst, memory mem, legVec idx, rRegP tmp, legVec mask) %{
3586   predicate(Matcher::vector_length_in_bytes(n) <= 32);
3587   match(Set dst (LoadVectorGather mem idx));
3588   effect(TEMP dst, TEMP tmp, TEMP mask);
3589   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and $mask as TEMP" %}
3590   ins_encode %{
3591     assert(UseAVX >= 2, "sanity");
3592 
3593     int vlen_enc = vector_length_encoding(this);
3594     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3595 
3596     assert(Matcher::vector_length_in_bytes(this) >= 16, "sanity");
3597     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3598 
3599     if (vlen_enc == Assembler::AVX_128bit) {
3600       __ movdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3601     } else {
3602       __ vmovdqu($mask$$XMMRegister, ExternalAddress(vector_all_bits_set()));
3603     }
3604     __ lea($tmp$$Register, $mem$$Address);
3605     __ vgather(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx$$XMMRegister, $mask$$XMMRegister, vlen_enc);
3606   %}
3607   ins_pipe( pipe_slow );
3608 %}
3609 
3610 instruct evgather(vec dst, memory mem, vec idx, rRegP tmp, kReg ktmp) %{
3611   predicate(Matcher::vector_length_in_bytes(n) == 64);
3612   match(Set dst (LoadVectorGather mem idx));
3613   effect(TEMP dst, TEMP tmp, TEMP ktmp);
3614   format %{ "load_vector_gather $dst, $mem, $idx\t! using $tmp and k2 as TEMP" %}
3615   ins_encode %{
3616     assert(UseAVX > 2, "sanity");
3617 
3618     int vlen_enc = vector_length_encoding(this);
3619     BasicType elem_bt = Matcher::vector_element_basic_type(this);
3620 
3621     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3622 
3623     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3624     __ lea($tmp$$Register, $mem$$Address);
3625     __ evgather(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $tmp$$Register, $idx$$XMMRegister, vlen_enc);
3626   %}
3627   ins_pipe( pipe_slow );
3628 %}
3629 
3630 // ====================Scatter=======================================
3631 
3632 // Scatter INT, LONG, FLOAT, DOUBLE
3633 
3634 instruct scatter(memory mem, vec src, vec idx, rRegP tmp, kReg ktmp) %{
3635   predicate(UseAVX > 2);
3636   match(Set mem (StoreVectorScatter mem (Binary src idx)));
3637   effect(TEMP tmp, TEMP ktmp);
3638   format %{ "store_vector_scatter $mem, $idx, $src\t! using k2 and $tmp as TEMP" %}
3639   ins_encode %{
3640     int vlen_enc = vector_length_encoding(this, $src);
3641     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
3642 
3643     assert(Matcher::vector_length_in_bytes(this, $src) >= 16, "sanity");
3644     assert(!is_subword_type(elem_bt), "sanity"); // T_INT, T_LONG, T_FLOAT, T_DOUBLE
3645 
3646     __ kmovwl($ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), $tmp$$Register);
3647     __ lea($tmp$$Register, $mem$$Address);
3648     __ evscatter(elem_bt, $tmp$$Register, $idx$$XMMRegister, $ktmp$$KRegister, $src$$XMMRegister, vlen_enc);
3649   %}
3650   ins_pipe( pipe_slow );
3651 %}
3652 
3653 // ====================REPLICATE=======================================
3654 
3655 // Replicate byte scalar to be vector
3656 instruct ReplB_reg(vec dst, rRegI src) %{
3657   match(Set dst (ReplicateB src));
3658   format %{ "replicateB $dst,$src" %}
3659   ins_encode %{
3660     uint vlen = Matcher::vector_length(this);
3661     if (vlen == 64 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
3662       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit byte vectors assume AVX512BW
3663       int vlen_enc = vector_length_encoding(this);
3664       __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vlen_enc);
3665     } else if (VM_Version::supports_avx2()) {
3666       int vlen_enc = vector_length_encoding(this);
3667       __ movdl($dst$$XMMRegister, $src$$Register);
3668       __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3669     } else {
3670       __ movdl($dst$$XMMRegister, $src$$Register);
3671       __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3672       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3673       if (vlen >= 16) {
3674         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3675         if (vlen >= 32) {
3676           assert(vlen == 32, "sanity");
3677           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3678         }
3679       }
3680     }
3681   %}
3682   ins_pipe( pipe_slow );
3683 %}
3684 
3685 instruct ReplB_mem(vec dst, memory mem) %{
3686   predicate(VM_Version::supports_avx2());
3687   match(Set dst (ReplicateB (LoadB mem)));
3688   format %{ "replicateB $dst,$mem" %}
3689   ins_encode %{
3690     int vlen_enc = vector_length_encoding(this);
3691     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vlen_enc);
3692   %}
3693   ins_pipe( pipe_slow );
3694 %}
3695 
3696 instruct ReplB_imm(vec dst, immI con) %{
3697   match(Set dst (ReplicateB con));
3698   format %{ "replicateB $dst,$con" %}
3699   ins_encode %{
3700     uint vlen = Matcher::vector_length(this);
3701     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 1));
3702     if (vlen == 4) {
3703       __ movdl($dst$$XMMRegister, const_addr);
3704     } else {
3705       __ movq($dst$$XMMRegister, const_addr);
3706       if (vlen >= 16) {
3707         if (VM_Version::supports_avx2()) {
3708           int vlen_enc = vector_length_encoding(this);
3709           __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3710         } else {
3711           assert(vlen == 16, "sanity");
3712           __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3713         }
3714       }
3715     }
3716   %}
3717   ins_pipe( pipe_slow );
3718 %}
3719 
3720 // Replicate byte scalar zero to be vector
3721 instruct ReplB_zero(vec dst, immI_0 zero) %{
3722   match(Set dst (ReplicateB zero));
3723   format %{ "replicateB $dst,$zero" %}
3724   ins_encode %{
3725     uint vlen = Matcher::vector_length(this);
3726     if (vlen <= 16) {
3727       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3728     } else {
      // Use vpxor since AVX512F does not have 512-bit vxorpd (requires AVX512DQ).
3730       int vlen_enc = vector_length_encoding(this);
3731       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3732     }
3733   %}
3734   ins_pipe( fpu_reg_reg );
3735 %}
3736 
3737 // ====================ReplicateS=======================================
3738 
3739 instruct ReplS_reg(vec dst, rRegI src) %{
3740   match(Set dst (ReplicateS src));
3741   format %{ "replicateS $dst,$src" %}
3742   ins_encode %{
3743     uint vlen = Matcher::vector_length(this);
3744     if (vlen == 32 || VM_Version::supports_avx512vlbw()) { // AVX512VL for <512bit operands
3745       assert(VM_Version::supports_avx512bw(), "required"); // 512-bit short vectors assume AVX512BW
3746       int vlen_enc = vector_length_encoding(this);
3747       __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vlen_enc);
3748     } else if (VM_Version::supports_avx2()) {
3749       int vlen_enc = vector_length_encoding(this);
3750       __ movdl($dst$$XMMRegister, $src$$Register);
3751       __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3752     } else {
3753       __ movdl($dst$$XMMRegister, $src$$Register);
3754       __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3755       if (vlen >= 8) {
3756         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3757         if (vlen >= 16) {
3758           assert(vlen == 16, "sanity");
3759           __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3760         }
3761       }
3762     }
3763   %}
3764   ins_pipe( pipe_slow );
3765 %}
3766 
3767 instruct ReplS_mem(vec dst, memory mem) %{
3768   predicate(VM_Version::supports_avx2());
3769   match(Set dst (ReplicateS (LoadS mem)));
3770   format %{ "replicateS $dst,$mem" %}
3771   ins_encode %{
3772     int vlen_enc = vector_length_encoding(this);
3773     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vlen_enc);
3774   %}
3775   ins_pipe( pipe_slow );
3776 %}
3777 
3778 instruct ReplS_imm(vec dst, immI con) %{
3779   match(Set dst (ReplicateS con));
3780   format %{ "replicateS $dst,$con" %}
3781   ins_encode %{
3782     uint vlen = Matcher::vector_length(this);
3783     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 2));
3784     if (vlen == 2) {
3785       __ movdl($dst$$XMMRegister, const_addr);
3786     } else {
3787       __ movq($dst$$XMMRegister, const_addr);
3788       if (vlen >= 8) {
3789         if (VM_Version::supports_avx2()) {
3790           int vlen_enc = vector_length_encoding(this);
3791           __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3792         } else {
3793           assert(vlen == 8, "sanity");
3794           __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3795         }
3796       }
3797     }
3798   %}
3799   ins_pipe( fpu_reg_reg );
3800 %}
3801 
3802 instruct ReplS_zero(vec dst, immI_0 zero) %{
3803   match(Set dst (ReplicateS zero));
3804   format %{ "replicateS $dst,$zero" %}
3805   ins_encode %{
3806     uint vlen = Matcher::vector_length(this);
3807     if (vlen <= 8) {
3808       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3809     } else {
3810       int vlen_enc = vector_length_encoding(this);
3811       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3812     }
3813   %}
3814   ins_pipe( fpu_reg_reg );
3815 %}
3816 
3817 // ====================ReplicateI=======================================
3818 
3819 instruct ReplI_reg(vec dst, rRegI src) %{
3820   match(Set dst (ReplicateI src));
3821   format %{ "replicateI $dst,$src" %}
3822   ins_encode %{
3823     uint vlen = Matcher::vector_length(this);
3824     if (vlen == 16 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3825       int vlen_enc = vector_length_encoding(this);
3826       __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vlen_enc);
3827     } else if (VM_Version::supports_avx2()) {
3828       int vlen_enc = vector_length_encoding(this);
3829       __ movdl($dst$$XMMRegister, $src$$Register);
3830       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3831     } else {
3832       __ movdl($dst$$XMMRegister, $src$$Register);
3833       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3834       if (vlen >= 8) {
3835         assert(vlen == 8, "sanity");
3836         __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3837       }
3838     }
3839   %}
3840   ins_pipe( pipe_slow );
3841 %}
3842 
3843 instruct ReplI_mem(vec dst, memory mem) %{
3844   match(Set dst (ReplicateI (LoadI mem)));
3845   format %{ "replicateI $dst,$mem" %}
3846   ins_encode %{
3847     uint vlen = Matcher::vector_length(this);
3848     if (vlen <= 4) {
3849       __ movdl($dst$$XMMRegister, $mem$$Address);
3850       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3851     } else {
3852       assert(VM_Version::supports_avx2(), "sanity");
3853       int vlen_enc = vector_length_encoding(this);
3854       __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vlen_enc);
3855     }
3856   %}
3857   ins_pipe( pipe_slow );
3858 %}
3859 
3860 instruct ReplI_imm(vec dst, immI con) %{
3861   match(Set dst (ReplicateI con));
3862   format %{ "replicateI $dst,$con" %}
3863   ins_encode %{
3864     uint vlen = Matcher::vector_length(this);
3865     InternalAddress const_addr = $constantaddress(replicate8_imm($con$$constant, 4));
3866     if (vlen <= 4) {
3867       __ movq($dst$$XMMRegister, const_addr);
3868       if (vlen == 4) {
3869         __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3870       }
3871     } else {
3872       assert(VM_Version::supports_avx2(), "sanity");
3873       int vlen_enc = vector_length_encoding(this);
3874       __ movq($dst$$XMMRegister, const_addr);
3875       __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3876     }
3877   %}
3878   ins_pipe( pipe_slow );
3879 %}
3880 
3881 // Replicate integer (4 byte) scalar zero to be vector
3882 instruct ReplI_zero(vec dst, immI_0 zero) %{
3883   match(Set dst (ReplicateI zero));
3884   format %{ "replicateI $dst,$zero" %}
3885   ins_encode %{
3886     uint vlen = Matcher::vector_length(this);
3887     if (vlen <= 4) {
3888       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3889     } else {
3890       int vlen_enc = vector_length_encoding(this);
3891       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3892     }
3893   %}
3894   ins_pipe( fpu_reg_reg );
3895 %}
3896 
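// Replicate scalar -1 (all bits set) to be vector. The all-ones bit pattern is the
// same for byte, short and int elements, so one vallones rule covers ReplicateB,
// ReplicateS and ReplicateI when AVX is available.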
3897 instruct ReplI_M1(vec dst, immI_M1 con) %{
3898   predicate(UseAVX > 0);
3899   match(Set dst (ReplicateB con));
3900   match(Set dst (ReplicateS con));
3901   match(Set dst (ReplicateI con));
3902   effect(TEMP dst);
3903   format %{ "vallones $dst" %}
3904   ins_encode %{
3905     int vector_len = vector_length_encoding(this);
3906     __ vallones($dst$$XMMRegister, vector_len);
3907   %}
3908   ins_pipe( pipe_slow );
3909 %}
3910 
3911 // ====================ReplicateL=======================================
3912 
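// On LP64 the long value is broadcast straight from a general register. On 32-bit
// it arrives as a register pair, so the low and high halves are first assembled
// into an XMM register (movdl + punpckldq) and then replicated.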
3913 #ifdef _LP64
3914 // Replicate long (8 byte) scalar to be vector
3915 instruct ReplL_reg(vec dst, rRegL src) %{
3916   match(Set dst (ReplicateL src));
3917   format %{ "replicateL $dst,$src" %}
3918   ins_encode %{
3919     uint vlen = Matcher::vector_length(this);
3920     if (vlen == 2) {
3921       __ movdq($dst$$XMMRegister, $src$$Register);
3922       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3923     } else if (vlen == 8 || VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3924       int vlen_enc = vector_length_encoding(this);
3925       __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vlen_enc);
3926     } else if (VM_Version::supports_avx2()) {
3927       assert(vlen == 4, "sanity");
3928       int vlen_enc = vector_length_encoding(this);
3929       __ movdq($dst$$XMMRegister, $src$$Register);
3930       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3931     } else {
3932       assert(vlen == 4, "sanity");
3933       __ movdq($dst$$XMMRegister, $src$$Register);
3934       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3935       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3936     }
3937   %}
3938   ins_pipe( pipe_slow );
3939 %}
3940 #else // _LP64
3941 // Replicate long (8 byte) scalar to be vector
3942 instruct ReplL_reg(vec dst, eRegL src, vec tmp) %{
3943   predicate(Matcher::vector_length(n) <= 4);
3944   match(Set dst (ReplicateL src));
3945   effect(TEMP dst, USE src, TEMP tmp);
3946   format %{ "replicateL $dst,$src" %}
3947   ins_encode %{
3948     uint vlen = Matcher::vector_length(this);
3949     if (vlen == 2) {
3950       __ movdl($dst$$XMMRegister, $src$$Register);
3951       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3952       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3953       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3954     } else if (VM_Version::supports_avx512vl()) { // AVX512VL for <512bit operands
3955       int vlen_enc = Assembler::AVX_256bit;
3956       __ movdl($dst$$XMMRegister, $src$$Register);
3957       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3958       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3959       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3960     } else {
3961       __ movdl($dst$$XMMRegister, $src$$Register);
3962       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3963       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3964       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3965       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3966     }
3967   %}
3968   ins_pipe( pipe_slow );
3969 %}
3970 
3971 instruct ReplL_reg_leg(legVec dst, eRegL src, legVec tmp) %{
3972   predicate(Matcher::vector_length(n) == 8);
3973   match(Set dst (ReplicateL src));
3974   effect(TEMP dst, USE src, TEMP tmp);
3975   format %{ "replicateL $dst,$src" %}
3976   ins_encode %{
3977     if (VM_Version::supports_avx512vl()) {
3978       __ movdl($dst$$XMMRegister, $src$$Register);
3979       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3980       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3981       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3982       __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3983       __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3984     } else {
3985       int vlen_enc = Assembler::AVX_512bit;
3986       __ movdl($dst$$XMMRegister, $src$$Register);
3987       __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3988       __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3989       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
3990     }
3991   %}
3992   ins_pipe( pipe_slow );
3993 %}
3994 #endif // _LP64
3995 
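// Replicate long (8 byte) scalar loaded from memory to be vector.
// Vectors wider than 128 bits use the AVX2 memory form of vpbroadcastq.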
3996 instruct ReplL_mem(vec dst, memory mem) %{
3997   match(Set dst (ReplicateL (LoadL mem)));
3998   format %{ "replicateL $dst,$mem" %}
3999   ins_encode %{
4000     uint vlen = Matcher::vector_length(this);
4001     if (vlen == 2) {
4002       __ movq($dst$$XMMRegister, $mem$$Address);
4003       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4004     } else {
4005       assert(VM_Version::supports_avx2(), "sanity");
4006       int vlen_enc = vector_length_encoding(this);
4007       __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vlen_enc);
4008     }
4009   %}
4010   ins_pipe( pipe_slow );
4011 %}
4012 
4013 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4014 instruct ReplL_imm(vec dst, immL con) %{
4015   match(Set dst (ReplicateL con));
4016   format %{ "replicateL $dst,$con" %}
4017   ins_encode %{
4018     uint vlen = Matcher::vector_length(this);
4019     InternalAddress const_addr = $constantaddress($con);
4020     if (vlen == 2) {
4021       __ movq($dst$$XMMRegister, const_addr);
4022       __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4023     } else {
4024       assert(VM_Version::supports_avx2(), "sanity");
4025       int vlen_enc = vector_length_encoding(this);
4026       __ movq($dst$$XMMRegister, const_addr);
4027       __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4028     }
4029   %}
4030   ins_pipe( pipe_slow );
4031 %}
4032 
4033 instruct ReplL_zero(vec dst, immL0 zero) %{
4034   match(Set dst (ReplicateL zero));
4035   format %{ "replicateL $dst,$zero" %}
4036   ins_encode %{
4037     int vlen = Matcher::vector_length(this);
4038     if (vlen == 2) {
4039       __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4040     } else {
4041       int vlen_enc = vector_length_encoding(this);
4042       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
4043     }
4044   %}
4045   ins_pipe( fpu_reg_reg );
4046 %}
4047 
4048 instruct ReplL_M1(vec dst, immL_M1 con) %{
4049   predicate(UseAVX > 0);
4050   match(Set dst (ReplicateL con));
4051   effect(TEMP dst);
4052   format %{ "vallones $dst" %}
4053   ins_encode %{
4054     int vector_len = vector_length_encoding(this);
4055     __ vallones($dst$$XMMRegister, vector_len);
4056   %}
4057   ins_pipe( pipe_slow );
4058 %}
4059 
4060 // ====================ReplicateF=======================================
4061 
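// Replicate float (4 byte) scalar to be vector. The source is already in an XMM
// register, so 128-bit vectors just shuffle it in place; the reg-to-reg vbroadcastss
// form needs AVX2, and plain AVX shuffles the lane and mirrors it into the high half.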
4062 instruct ReplF_reg(vec dst, vlRegF src) %{
4063   match(Set dst (ReplicateF src));
4064   format %{ "replicateF $dst,$src" %}
4065   ins_encode %{
4066     uint vlen = Matcher::vector_length(this);
4067     if (vlen <= 4) {
4068       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4069     } else if (VM_Version::supports_avx2()) {
4070       int vlen_enc = vector_length_encoding(this);
4071       __ vbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4072     } else {
4073       assert(vlen == 8, "sanity");
4074       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4075       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4076     }
4077   %}
4078   ins_pipe( pipe_slow );
4079 %}
4080 
4081 instruct ReplF_mem(vec dst, memory mem) %{
4082   match(Set dst (ReplicateF (LoadF mem)));
4083   format %{ "replicateF $dst,$mem" %}
4084   ins_encode %{
4085     uint vlen = Matcher::vector_length(this);
4086     if (vlen <= 4) {
4087       __ movdl($dst$$XMMRegister, $mem$$Address);
4088       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4089     } else {
4090       assert(VM_Version::supports_avx(), "sanity");
4091       int vlen_enc = vector_length_encoding(this);
4092       __ vbroadcastss($dst$$XMMRegister, $mem$$Address, vlen_enc);
4093     }
4094   %}
4095   ins_pipe( pipe_slow );
4096 %}
4097 
4098 instruct ReplF_zero(vec dst, immF0 zero) %{
4099   match(Set dst (ReplicateF zero));
4100   format %{ "replicateF $dst,$zero" %}
4101   ins_encode %{
4102     uint vlen = Matcher::vector_length(this);
4103     if (vlen <= 4) {
4104       __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
4105     } else {
4106       int vlen_enc = vector_length_encoding(this);
4107       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorps requires AVX512DQ
4108     }
4109   %}
4110   ins_pipe( fpu_reg_reg );
4111 %}
4112 
4113 // ====================ReplicateD=======================================
4114 
4115 // Replicate double (8 bytes) scalar to be vector
4116 instruct ReplD_reg(vec dst, vlRegD src) %{
4117   match(Set dst (ReplicateD src));
4118   format %{ "replicateD $dst,$src" %}
4119   ins_encode %{
4120     uint vlen = Matcher::vector_length(this);
4121     if (vlen == 2) {
4122       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4123     } else if (VM_Version::supports_avx2()) {
4124       int vlen_enc = vector_length_encoding(this);
4125       __ vbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc); // reg-to-reg variant requires AVX2
4126     } else {
4127       assert(vlen == 4, "sanity");
4128       __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4129       __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
4130     }
4131   %}
4132   ins_pipe( pipe_slow );
4133 %}
4134 
4135 instruct ReplD_mem(vec dst, memory mem) %{
4136   match(Set dst (ReplicateD (LoadD mem)));
4137   format %{ "replicateD $dst,$mem" %}
4138   ins_encode %{
4139     uint vlen = Matcher::vector_length(this);
4140     if (vlen == 2) {
4141       __ movq($dst$$XMMRegister, $mem$$Address);
4142       __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x44);
4143     } else {
4144       assert(VM_Version::supports_avx(), "sanity");
4145       int vlen_enc = vector_length_encoding(this);
4146       __ vbroadcastsd($dst$$XMMRegister, $mem$$Address, vlen_enc);
4147     }
4148   %}
4149   ins_pipe( pipe_slow );
4150 %}
4151 
4152 instruct ReplD_zero(vec dst, immD0 zero) %{
4153   match(Set dst (ReplicateD zero));
4154   format %{ "replicateD $dst,$zero" %}
4155   ins_encode %{
4156     uint vlen = Matcher::vector_length(this);
4157     if (vlen == 2) {
4158       __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
4159     } else {
4160       int vlen_enc = vector_length_encoding(this);
4161       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc); // 512bit vxorpd requires AVX512DQ
4162     }
4163   %}
4164   ins_pipe( fpu_reg_reg );
4165 %}
4166 
4167 // ====================VECTOR INSERT=======================================
4168 
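// Insert a scalar element into a vector at a constant index. For vectors wider than
// 128 bits the index is split: x_idx selects the position within a 128-bit lane and
// y_idx selects the lane. The lane is extracted into a temp, the scalar is inserted
// there, and the updated lane is written back into the destination.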
4169 instruct insert(vec dst, rRegI val, immU8 idx) %{
4170   predicate(Matcher::vector_length_in_bytes(n) < 32);
4171   match(Set dst (VectorInsert (Binary dst val) idx));
4172   format %{ "vector_insert $dst,$val,$idx" %}
4173   ins_encode %{
4174     assert(UseSSE >= 4, "required");
4175     assert(Matcher::vector_length_in_bytes(this) >= 8, "required");
4176 
4177     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4178 
4179     assert(is_integral_type(elem_bt), "");
4180     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4181 
4182     __ insert(elem_bt, $dst$$XMMRegister, $val$$Register, $idx$$constant);
4183   %}
4184   ins_pipe( pipe_slow );
4185 %}
4186 
4187 instruct insert32(vec dst, vec src, rRegI val, immU8 idx, vec vtmp) %{
4188   predicate(Matcher::vector_length_in_bytes(n) == 32);
4189   match(Set dst (VectorInsert (Binary src val) idx));
4190   effect(TEMP vtmp);
4191   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4192   ins_encode %{
4193     int vlen_enc = Assembler::AVX_256bit;
4194     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4195     int elem_per_lane = 16/type2aelembytes(elem_bt);
4196     int log2epr = log2(elem_per_lane);
4197 
4198     assert(is_integral_type(elem_bt), "sanity");
4199     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4200 
4201     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4202     uint y_idx = ($idx$$constant >> log2epr) & 1;
4203     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4204     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4205     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4206   %}
4207   ins_pipe( pipe_slow );
4208 %}
4209 
4210 instruct insert64(vec dst, vec src, rRegI val, immU8 idx, legVec vtmp) %{
4211   predicate(Matcher::vector_length_in_bytes(n) == 64);
4212   match(Set dst (VectorInsert (Binary src val) idx));
4213   effect(TEMP vtmp);
4214   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4215   ins_encode %{
4216     assert(UseAVX > 2, "sanity");
4217 
4218     BasicType elem_bt = Matcher::vector_element_basic_type(this);
4219     int elem_per_lane = 16/type2aelembytes(elem_bt);
4220     int log2epr = log2(elem_per_lane);
4221 
4222     assert(is_integral_type(elem_bt), "");
4223     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4224 
4225     uint x_idx = $idx$$constant & right_n_bits(log2epr);
4226     uint y_idx = ($idx$$constant >> log2epr) & 3;
4227     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4228     __ vinsert(elem_bt, $vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4229     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4230   %}
4231   ins_pipe( pipe_slow );
4232 %}
4233 
4234 #ifdef _LP64
4235 instruct insert2L(vec dst, rRegL val, immU8 idx) %{
4236   predicate(Matcher::vector_length(n) == 2);
4237   match(Set dst (VectorInsert (Binary dst val) idx));
4238   format %{ "vector_insert $dst,$val,$idx" %}
4239   ins_encode %{
4240     assert(UseSSE >= 4, "required");
4241     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4242     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4243 
4244     __ pinsrq($dst$$XMMRegister, $val$$Register, $idx$$constant);
4245   %}
4246   ins_pipe( pipe_slow );
4247 %}
4248 
4249 instruct insert4L(vec dst, vec src, rRegL val, immU8 idx, vec vtmp) %{
4250   predicate(Matcher::vector_length(n) == 4);
4251   match(Set dst (VectorInsert (Binary src val) idx));
4252   effect(TEMP vtmp);
4253   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4254   ins_encode %{
4255     assert(Matcher::vector_element_basic_type(this) == T_LONG, "");
4256     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4257 
4258     uint x_idx = $idx$$constant & right_n_bits(1);
4259     uint y_idx = ($idx$$constant >> 1) & 1;
4260     int vlen_enc = Assembler::AVX_256bit;
4261     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4262     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4263     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4264   %}
4265   ins_pipe( pipe_slow );
4266 %}
4267 
4268 instruct insert8L(vec dst, vec src, rRegL val, immU8 idx, legVec vtmp) %{
4269   predicate(Matcher::vector_length(n) == 8);
4270   match(Set dst (VectorInsert (Binary src val) idx));
4271   effect(TEMP vtmp);
4272   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4273   ins_encode %{
4274     assert(Matcher::vector_element_basic_type(this) == T_LONG, "sanity");
4275     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4276 
4277     uint x_idx = $idx$$constant & right_n_bits(1);
4278     uint y_idx = ($idx$$constant >> 1) & 3;
4279     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4280     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$Register, x_idx);
4281     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4282   %}
4283   ins_pipe( pipe_slow );
4284 %}
4285 #endif
4286 
4287 instruct insertF(vec dst, regF val, immU8 idx) %{
4288   predicate(Matcher::vector_length(n) < 8);
4289   match(Set dst (VectorInsert (Binary dst val) idx));
4290   format %{ "vector_insert $dst,$val,$idx" %}
4291   ins_encode %{
4292     assert(UseSSE >= 4, "sanity");
4293 
4294     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4295     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4296 
4297     __ insertps($dst$$XMMRegister, $val$$XMMRegister, $idx$$constant);
4298   %}
4299   ins_pipe( pipe_slow );
4300 %}
4301 
4302 instruct vinsertF(vec dst, vec src, regF val, immU8 idx, vec vtmp) %{
4303   predicate(Matcher::vector_length(n) >= 8);
4304   match(Set dst (VectorInsert (Binary src val) idx));
4305   effect(TEMP vtmp);
4306   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4307   ins_encode %{
4308     assert(Matcher::vector_element_basic_type(this) == T_FLOAT, "sanity");
4309     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4310 
4311     int vlen = Matcher::vector_length(this);
4312     uint x_idx = $idx$$constant & right_n_bits(2);
4313     if (vlen == 8) {
4314       uint y_idx = ($idx$$constant >> 2) & 1;
4315       int vlen_enc = Assembler::AVX_256bit;
4316       __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4317       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4318       __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4319     } else {
4320       assert(vlen == 16, "sanity");
4321       uint y_idx = ($idx$$constant >> 2) & 3;
4322       __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4323       __ vinsertps($vtmp$$XMMRegister, $vtmp$$XMMRegister, $val$$XMMRegister, x_idx);
4324       __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4325     }
4326   %}
4327   ins_pipe( pipe_slow );
4328 %}
4329 
4330 #ifdef _LP64
4331 instruct insert2D(vec dst, regD val, immU8 idx, rRegL tmp) %{
4332   predicate(Matcher::vector_length(n) == 2);
4333   match(Set dst (VectorInsert (Binary dst val) idx));
4334   effect(TEMP tmp);
4335   format %{ "vector_insert $dst,$val,$idx\t!using $tmp as TEMP" %}
4336   ins_encode %{
4337     assert(UseSSE >= 4, "sanity");
4338     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4339     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4340 
4341     __ movq($tmp$$Register, $val$$XMMRegister);
4342     __ pinsrq($dst$$XMMRegister, $tmp$$Register, $idx$$constant);
4343   %}
4344   ins_pipe( pipe_slow );
4345 %}
4346 
4347 instruct insert4D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, vec vtmp) %{
4348   predicate(Matcher::vector_length(n) == 4);
4349   match(Set dst (VectorInsert (Binary src val) idx));
4350   effect(TEMP vtmp, TEMP tmp);
4351   format %{ "vector_insert $dst,$src,$val,$idx\t!using $tmp, $vtmp as TEMP" %}
4352   ins_encode %{
4353     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4354     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4355 
4356     uint x_idx = $idx$$constant & right_n_bits(1);
4357     uint y_idx = ($idx$$constant >> 1) & 1;
4358     int vlen_enc = Assembler::AVX_256bit;
4359     __ movq($tmp$$Register, $val$$XMMRegister);
4360     __ vextracti128($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4361     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4362     __ vinserti128($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4363   %}
4364   ins_pipe( pipe_slow );
4365 %}
4366 
4367 instruct insert8D(vec dst, vec src, regD val, immU8 idx, rRegL tmp, legVec vtmp) %{
4368   predicate(Matcher::vector_length(n) == 8);
4369   match(Set dst (VectorInsert (Binary src val) idx));
4370   effect(TEMP tmp, TEMP vtmp);
4371   format %{ "vector_insert $dst,$src,$val,$idx\t!using $vtmp as TEMP" %}
4372   ins_encode %{
4373     assert(Matcher::vector_element_basic_type(this) == T_DOUBLE, "sanity");
4374     assert($idx$$constant < (int)Matcher::vector_length(this), "out of bounds");
4375 
4376     uint x_idx = $idx$$constant & right_n_bits(1);
4377     uint y_idx = ($idx$$constant >> 1) & 3;
4378     __ movq($tmp$$Register, $val$$XMMRegister);
4379     __ vextracti32x4($vtmp$$XMMRegister, $src$$XMMRegister, y_idx);
4380     __ vpinsrq($vtmp$$XMMRegister, $vtmp$$XMMRegister, $tmp$$Register, x_idx);
4381     __ vinserti32x4($dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister, y_idx);
4382   %}
4383   ins_pipe( pipe_slow );
4384 %}
4385 #endif
4386 
4387 // ====================REDUCTION ARITHMETIC=======================================
4388 
4389 // =======================Int Reduction==========================================
4390 
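// Integer reductions fold all lanes of src2 into a scalar and combine it with the
// scalar input src1. The reduction kind is recovered from the ideal opcode at
// encoding time, so one rule serves add/mul/and/or/xor/min/max.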
4391 instruct reductionI(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4392   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_INT); // src2
4393   match(Set dst (AddReductionVI src1 src2));
4394   match(Set dst (MulReductionVI src1 src2));
4395   match(Set dst (AndReductionV  src1 src2));
4396   match(Set dst ( OrReductionV  src1 src2));
4397   match(Set dst (XorReductionV  src1 src2));
4398   match(Set dst (MinReductionV  src1 src2));
4399   match(Set dst (MaxReductionV  src1 src2));
4400   effect(TEMP vtmp1, TEMP vtmp2);
4401   format %{ "vector_reduction_int $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4402   ins_encode %{
4403     int opcode = this->ideal_Opcode();
4404     int vlen = Matcher::vector_length(this, $src2);
4405     __ reduceI(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4406   %}
4407   ins_pipe( pipe_slow );
4408 %}
4409 
4410 // =======================Long Reduction==========================================
4411 
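// Long reductions return their result in a 64-bit general register and are therefore
// LP64 only. Two flavors are keyed on AVX512DQ; without it the operands and temps
// are restricted to legacy (legVec) XMM registers.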
4412 #ifdef _LP64
4413 instruct reductionL(rRegL dst, rRegL src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4414   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && !VM_Version::supports_avx512dq());
4415   match(Set dst (AddReductionVL src1 src2));
4416   match(Set dst (MulReductionVL src1 src2));
4417   match(Set dst (AndReductionV  src1 src2));
4418   match(Set dst ( OrReductionV  src1 src2));
4419   match(Set dst (XorReductionV  src1 src2));
4420   match(Set dst (MinReductionV  src1 src2));
4421   match(Set dst (MaxReductionV  src1 src2));
4422   effect(TEMP vtmp1, TEMP vtmp2);
4423   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4424   ins_encode %{
4425     int opcode = this->ideal_Opcode();
4426     int vlen = Matcher::vector_length(this, $src2);
4427     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4428   %}
4429   ins_pipe( pipe_slow );
4430 %}
4431 
4432 instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtmp2) %{
4433   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_LONG && VM_Version::supports_avx512dq());
4434   match(Set dst (AddReductionVL src1 src2));
4435   match(Set dst (MulReductionVL src1 src2));
4436   match(Set dst (AndReductionV  src1 src2));
4437   match(Set dst ( OrReductionV  src1 src2));
4438   match(Set dst (XorReductionV  src1 src2));
4439   match(Set dst (MinReductionV  src1 src2));
4440   match(Set dst (MaxReductionV  src1 src2));
4441   effect(TEMP vtmp1, TEMP vtmp2);
4442   format %{ "vector_reduction_long $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4443   ins_encode %{
4444     int opcode = this->ideal_Opcode();
4445     int vlen = Matcher::vector_length(this, $src2);
4446     __ reduceL(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4447   %}
4448   ins_pipe( pipe_slow );
4449 %}
4450 #endif // _LP64
4451 
4452 // =======================Float Reduction==========================================
4453 
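// Float add/mul reductions accumulate into dst, which is both the scalar input and
// the result. Rules are split by vector length; the 16-float (512-bit) case uses
// legacy (legVec) registers for its source and temps.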
4454 instruct reductionF128(regF dst, vec src, vec vtmp) %{
4455   predicate(Matcher::vector_length(n->in(2)) <= 4); // src
4456   match(Set dst (AddReductionVF dst src));
4457   match(Set dst (MulReductionVF dst src));
4458   effect(TEMP dst, TEMP vtmp);
4459   format %{ "vector_reduction_float  $dst,$src ; using $vtmp as TEMP" %}
4460   ins_encode %{
4461     int opcode = this->ideal_Opcode();
4462     int vlen = Matcher::vector_length(this, $src);
4463     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4464   %}
4465   ins_pipe( pipe_slow );
4466 %}
4467 
4468 instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
4469   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4470   match(Set dst (AddReductionVF dst src));
4471   match(Set dst (MulReductionVF dst src));
4472   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4473   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4474   ins_encode %{
4475     int opcode = this->ideal_Opcode();
4476     int vlen = Matcher::vector_length(this, $src);
4477     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4478   %}
4479   ins_pipe( pipe_slow );
4480 %}
4481 
4482 instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4483   predicate(Matcher::vector_length(n->in(2)) == 16); // src
4484   match(Set dst (AddReductionVF dst src));
4485   match(Set dst (MulReductionVF dst src));
4486   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4487   format %{ "vector_reduction_float $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4488   ins_encode %{
4489     int opcode = this->ideal_Opcode();
4490     int vlen = Matcher::vector_length(this, $src);
4491     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4492   %}
4493   ins_pipe( pipe_slow );
4494 %}
4495 
4496 // =======================Double Reduction==========================================
4497 
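// Double add/mul reductions follow the same dst-accumulating pattern as the float
// reductions above, split by vector length.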
4498 instruct reduction2D(regD dst, vec src, vec vtmp) %{
4499   predicate(Matcher::vector_length(n->in(2)) == 2); // src
4500   match(Set dst (AddReductionVD dst src));
4501   match(Set dst (MulReductionVD dst src));
4502   effect(TEMP dst, TEMP vtmp);
4503   format %{ "vector_reduction_double $dst,$src ; using $vtmp as TEMP" %}
4504   ins_encode %{
4505     int opcode = this->ideal_Opcode();
4506     int vlen = Matcher::vector_length(this, $src);
4507     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp$$XMMRegister);
4508   %}
4509   ins_pipe( pipe_slow );
4510 %}
4511 
4512 instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
4513   predicate(Matcher::vector_length(n->in(2)) == 4); // src
4514   match(Set dst (AddReductionVD dst src));
4515   match(Set dst (MulReductionVD dst src));
4516   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4517   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4518   ins_encode %{
4519     int opcode = this->ideal_Opcode();
4520     int vlen = Matcher::vector_length(this, $src);
4521     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4522   %}
4523   ins_pipe( pipe_slow );
4524 %}
4525 
4526 instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
4527   predicate(Matcher::vector_length(n->in(2)) == 8); // src
4528   match(Set dst (AddReductionVD dst src));
4529   match(Set dst (MulReductionVD dst src));
4530   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4531   format %{ "vector_reduction_double $dst,$src ; using $vtmp1, $vtmp2 as TEMP" %}
4532   ins_encode %{
4533     int opcode = this->ideal_Opcode();
4534     int vlen = Matcher::vector_length(this, $src);
4535     __ reduce_fp(opcode, vlen, $dst$$XMMRegister, $src$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4536   %}
4537   ins_pipe( pipe_slow );
4538 %}
4539 
4540 // =======================Byte Reduction==========================================
4541 
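// Byte add/logical/min/max reductions come in two flavors keyed on AVX512BW;
// without it the operands and temps must be legacy (legVec) XMM registers.
// Multiply reductions over bytes are handled separately below.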
4542 #ifdef _LP64
4543 instruct reductionB(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4544   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && !VM_Version::supports_avx512bw());
4545   match(Set dst (AddReductionVI src1 src2));
4546   match(Set dst (AndReductionV  src1 src2));
4547   match(Set dst ( OrReductionV  src1 src2));
4548   match(Set dst (XorReductionV  src1 src2));
4549   match(Set dst (MinReductionV  src1 src2));
4550   match(Set dst (MaxReductionV  src1 src2));
4551   effect(TEMP vtmp1, TEMP vtmp2);
4552   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4553   ins_encode %{
4554     int opcode = this->ideal_Opcode();
4555     int vlen = Matcher::vector_length(this, $src2);
4556     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4557   %}
4558   ins_pipe( pipe_slow );
4559 %}
4560 
4561 instruct reductionB_avx512bw(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4562   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE && VM_Version::supports_avx512bw());
4563   match(Set dst (AddReductionVI src1 src2));
4564   match(Set dst (AndReductionV  src1 src2));
4565   match(Set dst ( OrReductionV  src1 src2));
4566   match(Set dst (XorReductionV  src1 src2));
4567   match(Set dst (MinReductionV  src1 src2));
4568   match(Set dst (MaxReductionV  src1 src2));
4569   effect(TEMP vtmp1, TEMP vtmp2);
4570   format %{ "vector_reduction_byte $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4571   ins_encode %{
4572     int opcode = this->ideal_Opcode();
4573     int vlen = Matcher::vector_length(this, $src2);
4574     __ reduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4575   %}
4576   ins_pipe( pipe_slow );
4577 %}
4578 #endif
4579 
4580 // =======================Short Reduction==========================================
4581 
4582 instruct reductionS(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4583   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_SHORT); // src2
4584   match(Set dst (AddReductionVI src1 src2));
4585   match(Set dst (MulReductionVI src1 src2));
4586   match(Set dst (AndReductionV  src1 src2));
4587   match(Set dst ( OrReductionV  src1 src2));
4588   match(Set dst (XorReductionV  src1 src2));
4589   match(Set dst (MinReductionV  src1 src2));
4590   match(Set dst (MaxReductionV  src1 src2));
4591   effect(TEMP vtmp1, TEMP vtmp2);
4592   format %{ "vector_reduction_short $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
4593   ins_encode %{
4594     int opcode = this->ideal_Opcode();
4595     int vlen = Matcher::vector_length(this, $src2);
4596     __ reduceS(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4597   %}
4598   ins_pipe( pipe_slow );
4599 %}
4600 
4601 // =======================Mul Reduction==========================================
4602 
4603 instruct mul_reductionB(rRegI dst, rRegI src1, vec src2, vec vtmp1, vec vtmp2) %{
4604   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4605             Matcher::vector_length(n->in(2)) <= 32); // src2
4606   match(Set dst (MulReductionVI src1 src2));
4607   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4608   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4609   ins_encode %{
4610     int opcode = this->ideal_Opcode();
4611     int vlen = Matcher::vector_length(this, $src2);
4612     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4613   %}
4614   ins_pipe( pipe_slow );
4615 %}
4616 
4617 instruct mul_reduction64B(rRegI dst, rRegI src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
4618   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_BYTE &&
4619             Matcher::vector_length(n->in(2)) == 64); // src2
4620   match(Set dst (MulReductionVI src1 src2));
4621   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
4622   format %{ "vector_mul_reduction_byte $dst,$src1,$src2; using $vtmp1, $vtmp2 as TEMP" %}
4623   ins_encode %{
4624     int opcode = this->ideal_Opcode();
4625     int vlen = Matcher::vector_length(this, $src2);
4626     __ mulreduceB(opcode, vlen, $dst$$Register, $src1$$Register, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
4627   %}
4628   ins_pipe( pipe_slow );
4629 %}
4630 
4631 //--------------------Min/Max Float Reduction --------------------
4632 // Float Min Reduction
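// Two forms are provided: the immF forms only match when the scalar input is the
// identity value for the operation (+Inf for min, -Inf for max) and ignore it,
// while the _av forms accumulate into dst.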
4633 instruct minmax_reduction2F(legRegF dst, immF src1, legVec src2, legVec tmp,
4634                             legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4635   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4636             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4637              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4638             Matcher::vector_length(n->in(2)) == 2);
4639   match(Set dst (MinReductionV src1 src2));
4640   match(Set dst (MaxReductionV src1 src2));
4641   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4642   format %{ "vector_minmax2F_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4643   ins_encode %{
4644     assert(UseAVX > 0, "sanity");
4645 
4646     int opcode = this->ideal_Opcode();
4647     int vlen = Matcher::vector_length(this, $src2);
4648     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4649                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4650   %}
4651   ins_pipe( pipe_slow );
4652 %}
4653 
4654 instruct minmax_reductionF(legRegF dst, immF src1, legVec src2, legVec tmp, legVec atmp,
4655                            legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4656   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4657             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeF::POS_INF) ||
4658              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeF::NEG_INF)) &&
4659             Matcher::vector_length(n->in(2)) >= 4);
4660   match(Set dst (MinReductionV src1 src2));
4661   match(Set dst (MaxReductionV src1 src2));
4662   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4663   format %{ "vector_minmaxF_reduction $dst,$src1,$src2  ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4664   ins_encode %{
4665     assert(UseAVX > 0, "sanity");
4666 
4667     int opcode = this->ideal_Opcode();
4668     int vlen = Matcher::vector_length(this, $src2);
4669     __ reduceFloatMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister, $tmp$$XMMRegister,
4670                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4671   %}
4672   ins_pipe( pipe_slow );
4673 %}
4674 
4675 instruct minmax_reduction2F_av(legRegF dst, legVec src, legVec tmp,
4676                                legVec atmp, legVec btmp, legVec xmm_1, rFlagsReg cr) %{
4677   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4678             Matcher::vector_length(n->in(2)) == 2);
4679   match(Set dst (MinReductionV dst src));
4680   match(Set dst (MaxReductionV dst src));
4681   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_1, KILL cr);
4682   format %{ "vector_minmax2F_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_1 as TEMP" %}
4683   ins_encode %{
4684     assert(UseAVX > 0, "sanity");
4685 
4686     int opcode = this->ideal_Opcode();
4687     int vlen = Matcher::vector_length(this, $src);
4688     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4689                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_1$$XMMRegister);
4690   %}
4691   ins_pipe( pipe_slow );
4692 %}
4693 
4694 
4695 instruct minmax_reductionF_av(legRegF dst, legVec src, legVec tmp,
4696                               legVec atmp, legVec btmp, legVec xmm_0, legVec xmm_1, rFlagsReg cr) %{
4697   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_FLOAT &&
4698             Matcher::vector_length(n->in(2)) >= 4);
4699   match(Set dst (MinReductionV dst src));
4700   match(Set dst (MaxReductionV dst src));
4701   effect(TEMP dst, TEMP tmp, TEMP atmp, TEMP btmp, TEMP xmm_0, TEMP xmm_1, KILL cr);
4702   format %{ "vector_minmaxF_reduction $dst,$src ; using $tmp, $atmp, $btmp, $xmm_0, $xmm_1 as TEMP" %}
4703   ins_encode %{
4704     assert(UseAVX > 0, "sanity");
4705 
4706     int opcode = this->ideal_Opcode();
4707     int vlen = Matcher::vector_length(this, $src);
4708     __ reduceFloatMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister,
4709                          $atmp$$XMMRegister, $btmp$$XMMRegister, $xmm_0$$XMMRegister, $xmm_1$$XMMRegister);
4710   %}
4711   ins_pipe( pipe_slow );
4712 %}
4713 
4714 
4715 //--------------------Min Double Reduction --------------------
4716 instruct minmax_reduction2D(legRegD dst, immD src1, legVec src2,
4717                             legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4718                             rFlagsReg cr) %{
4719   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4720             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4721              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4722             Matcher::vector_length(n->in(2)) == 2);
4723   match(Set dst (MinReductionV src1 src2));
4724   match(Set dst (MaxReductionV src1 src2));
4725   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4726   format %{ "vector_minmax2D_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4727   ins_encode %{
4728     assert(UseAVX > 0, "sanity");
4729 
4730     int opcode = this->ideal_Opcode();
4731     int vlen = Matcher::vector_length(this, $src2);
4732     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
4733                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
4734   %}
4735   ins_pipe( pipe_slow );
4736 %}
4737 
4738 instruct minmax_reductionD(legRegD dst, immD src1, legVec src2,
4739                            legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
4740                            rFlagsReg cr) %{
4741   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4742             ((n->Opcode() == Op_MinReductionV && n->in(1)->bottom_type() == TypeD::POS_INF) ||
4743              (n->Opcode() == Op_MaxReductionV && n->in(1)->bottom_type() == TypeD::NEG_INF)) &&
4744             Matcher::vector_length(n->in(2)) >= 4);
4745   match(Set dst (MinReductionV src1 src2));
4746   match(Set dst (MaxReductionV src1 src2));
4747   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
4748   format %{ "vector_minmaxD_reduction $dst,$src1,$src2 ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
4749   ins_encode %{
4750     assert(UseAVX > 0, "sanity");
4751 
4752     int opcode = this->ideal_Opcode();
4753     int vlen = Matcher::vector_length(this, $src2);
4754     __ reduceDoubleMinMax(opcode, vlen, false, $dst$$XMMRegister, $src2$$XMMRegister,
4755                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
4756   %}
4757   ins_pipe( pipe_slow );
4758 %}
4759 
4760 
4761 instruct minmax_reduction2D_av(legRegD dst, legVec src,
4762                                legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, // TEMPs
4763                                rFlagsReg cr) %{
4764   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4765             Matcher::vector_length(n->in(2)) == 2);
4766   match(Set dst (MinReductionV dst src));
4767   match(Set dst (MaxReductionV dst src));
4768   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr);
4769   format %{ "vector_minmax2D_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4 as TEMP" %}
4770   ins_encode %{
4771     assert(UseAVX > 0, "sanity");
4772 
4773     int opcode = this->ideal_Opcode();
4774     int vlen = Matcher::vector_length(this, $src);
4775     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
4776                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister);
4777   %}
4778   ins_pipe( pipe_slow );
4779 %}
4780 
4781 instruct minmax_reductionD_av(legRegD dst, legVec src,
4782                               legVec tmp1, legVec tmp2, legVec tmp3, legVec tmp4, legVec tmp5, // TEMPs
4783                               rFlagsReg cr) %{
4784   predicate(Matcher::vector_element_basic_type(n->in(2)) == T_DOUBLE &&
4785             Matcher::vector_length(n->in(2)) >= 4);
4786   match(Set dst (MinReductionV dst src));
4787   match(Set dst (MaxReductionV dst src));
4788   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, KILL cr);
4789   format %{ "vector_minmaxD_reduction $dst,$src ; using $tmp1, $tmp2, $tmp3, $tmp4, $tmp5 as TEMP" %}
4790   ins_encode %{
4791     assert(UseAVX > 0, "sanity");
4792 
4793     int opcode = this->ideal_Opcode();
4794     int vlen = Matcher::vector_length(this, $src);
4795     __ reduceDoubleMinMax(opcode, vlen, true, $dst$$XMMRegister, $src$$XMMRegister,
4796                           $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister, $tmp4$$XMMRegister, $tmp5$$XMMRegister);
4797   %}
4798   ins_pipe( pipe_slow );
4799 %}
4800 
4801 // ====================VECTOR ARITHMETIC=======================================
4802 
4803 // --------------------------------- ADD --------------------------------------
4804 
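// Each element type gets three add rules: a two-operand SSE form (dst op= src), a
// three-operand AVX register form, and an AVX register-memory form that is only
// used when the vector is wider than 8 bytes.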
4805 // Bytes vector add
4806 instruct vaddB(vec dst, vec src) %{
4807   predicate(UseAVX == 0);
4808   match(Set dst (AddVB dst src));
4809   format %{ "paddb   $dst,$src\t! add packedB" %}
4810   ins_encode %{
4811     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
4812   %}
4813   ins_pipe( pipe_slow );
4814 %}
4815 
4816 instruct vaddB_reg(vec dst, vec src1, vec src2) %{
4817   predicate(UseAVX > 0);
4818   match(Set dst (AddVB src1 src2));
4819   format %{ "vpaddb  $dst,$src1,$src2\t! add packedB" %}
4820   ins_encode %{
4821     int vlen_enc = vector_length_encoding(this);
4822     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4823   %}
4824   ins_pipe( pipe_slow );
4825 %}
4826 
4827 instruct vaddB_mem(vec dst, vec src, memory mem) %{
4828   predicate((UseAVX > 0) &&
4829             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4830   match(Set dst (AddVB src (LoadVector mem)));
4831   format %{ "vpaddb  $dst,$src,$mem\t! add packedB" %}
4832   ins_encode %{
4833     int vlen_enc = vector_length_encoding(this);
4834     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4835   %}
4836   ins_pipe( pipe_slow );
4837 %}
4838 
4839 // Shorts/Chars vector add
4840 instruct vaddS(vec dst, vec src) %{
4841   predicate(UseAVX == 0);
4842   match(Set dst (AddVS dst src));
4843   format %{ "paddw   $dst,$src\t! add packedS" %}
4844   ins_encode %{
4845     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
4846   %}
4847   ins_pipe( pipe_slow );
4848 %}
4849 
4850 instruct vaddS_reg(vec dst, vec src1, vec src2) %{
4851   predicate(UseAVX > 0);
4852   match(Set dst (AddVS src1 src2));
4853   format %{ "vpaddw  $dst,$src1,$src2\t! add packedS" %}
4854   ins_encode %{
4855     int vlen_enc = vector_length_encoding(this);
4856     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4857   %}
4858   ins_pipe( pipe_slow );
4859 %}
4860 
4861 instruct vaddS_mem(vec dst, vec src, memory mem) %{
4862   predicate((UseAVX > 0) &&
4863             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4864   match(Set dst (AddVS src (LoadVector mem)));
4865   format %{ "vpaddw  $dst,$src,$mem\t! add packedS" %}
4866   ins_encode %{
4867     int vlen_enc = vector_length_encoding(this);
4868     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4869   %}
4870   ins_pipe( pipe_slow );
4871 %}
4872 
4873 // Integers vector add
4874 instruct vaddI(vec dst, vec src) %{
4875   predicate(UseAVX == 0);
4876   match(Set dst (AddVI dst src));
4877   format %{ "paddd   $dst,$src\t! add packedI" %}
4878   ins_encode %{
4879     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
4880   %}
4881   ins_pipe( pipe_slow );
4882 %}
4883 
4884 instruct vaddI_reg(vec dst, vec src1, vec src2) %{
4885   predicate(UseAVX > 0);
4886   match(Set dst (AddVI src1 src2));
4887   format %{ "vpaddd  $dst,$src1,$src2\t! add packedI" %}
4888   ins_encode %{
4889     int vlen_enc = vector_length_encoding(this);
4890     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4891   %}
4892   ins_pipe( pipe_slow );
4893 %}
4894
4896 instruct vaddI_mem(vec dst, vec src, memory mem) %{
4897   predicate((UseAVX > 0) &&
4898             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4899   match(Set dst (AddVI src (LoadVector mem)));
4900   format %{ "vpaddd  $dst,$src,$mem\t! add packedI" %}
4901   ins_encode %{
4902     int vlen_enc = vector_length_encoding(this);
4903     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4904   %}
4905   ins_pipe( pipe_slow );
4906 %}
4907 
4908 // Longs vector add
4909 instruct vaddL(vec dst, vec src) %{
4910   predicate(UseAVX == 0);
4911   match(Set dst (AddVL dst src));
4912   format %{ "paddq   $dst,$src\t! add packedL" %}
4913   ins_encode %{
4914     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
4915   %}
4916   ins_pipe( pipe_slow );
4917 %}
4918 
4919 instruct vaddL_reg(vec dst, vec src1, vec src2) %{
4920   predicate(UseAVX > 0);
4921   match(Set dst (AddVL src1 src2));
4922   format %{ "vpaddq  $dst,$src1,$src2\t! add packedL" %}
4923   ins_encode %{
4924     int vlen_enc = vector_length_encoding(this);
4925     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4926   %}
4927   ins_pipe( pipe_slow );
4928 %}
4929 
4930 instruct vaddL_mem(vec dst, vec src, memory mem) %{
4931   predicate((UseAVX > 0) &&
4932             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4933   match(Set dst (AddVL src (LoadVector mem)));
4934   format %{ "vpaddq  $dst,$src,$mem\t! add packedL" %}
4935   ins_encode %{
4936     int vlen_enc = vector_length_encoding(this);
4937     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4938   %}
4939   ins_pipe( pipe_slow );
4940 %}
4941 
4942 // Floats vector add
4943 instruct vaddF(vec dst, vec src) %{
4944   predicate(UseAVX == 0);
4945   match(Set dst (AddVF dst src));
4946   format %{ "addps   $dst,$src\t! add packedF" %}
4947   ins_encode %{
4948     __ addps($dst$$XMMRegister, $src$$XMMRegister);
4949   %}
4950   ins_pipe( pipe_slow );
4951 %}
4952 
4953 instruct vaddF_reg(vec dst, vec src1, vec src2) %{
4954   predicate(UseAVX > 0);
4955   match(Set dst (AddVF src1 src2));
4956   format %{ "vaddps  $dst,$src1,$src2\t! add packedF" %}
4957   ins_encode %{
4958     int vlen_enc = vector_length_encoding(this);
4959     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4960   %}
4961   ins_pipe( pipe_slow );
4962 %}
4963 
4964 instruct vaddF_mem(vec dst, vec src, memory mem) %{
4965   predicate((UseAVX > 0) &&
4966             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
4967   match(Set dst (AddVF src (LoadVector mem)));
4968   format %{ "vaddps  $dst,$src,$mem\t! add packedF" %}
4969   ins_encode %{
4970     int vlen_enc = vector_length_encoding(this);
4971     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
4972   %}
4973   ins_pipe( pipe_slow );
4974 %}
4975 
4976 // Doubles vector add
4977 instruct vaddD(vec dst, vec src) %{
4978   predicate(UseAVX == 0);
4979   match(Set dst (AddVD dst src));
4980   format %{ "addpd   $dst,$src\t! add packedD" %}
4981   ins_encode %{
4982     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
4983   %}
4984   ins_pipe( pipe_slow );
4985 %}
4986 
4987 instruct vaddD_reg(vec dst, vec src1, vec src2) %{
4988   predicate(UseAVX > 0);
4989   match(Set dst (AddVD src1 src2));
4990   format %{ "vaddpd  $dst,$src1,$src2\t! add packedD" %}
4991   ins_encode %{
4992     int vlen_enc = vector_length_encoding(this);
4993     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
4994   %}
4995   ins_pipe( pipe_slow );
4996 %}
4997 
4998 instruct vaddD_mem(vec dst, vec src, memory mem) %{
4999   predicate((UseAVX > 0) &&
5000             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5001   match(Set dst (AddVD src (LoadVector mem)));
5002   format %{ "vaddpd  $dst,$src,$mem\t! add packedD" %}
5003   ins_encode %{
5004     int vlen_enc = vector_length_encoding(this);
5005     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5006   %}
5007   ins_pipe( pipe_slow );
5008 %}
5009 
5010 // --------------------------------- SUB --------------------------------------
5011 
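// Subtraction mirrors the add rules above: an SSE two-operand form, an AVX
// three-operand form and an AVX register-memory form per element type.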
5012 // Bytes vector sub
5013 instruct vsubB(vec dst, vec src) %{
5014   predicate(UseAVX == 0);
5015   match(Set dst (SubVB dst src));
5016   format %{ "psubb   $dst,$src\t! sub packedB" %}
5017   ins_encode %{
5018     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
5019   %}
5020   ins_pipe( pipe_slow );
5021 %}
5022 
5023 instruct vsubB_reg(vec dst, vec src1, vec src2) %{
5024   predicate(UseAVX > 0);
5025   match(Set dst (SubVB src1 src2));
5026   format %{ "vpsubb  $dst,$src1,$src2\t! sub packedB" %}
5027   ins_encode %{
5028     int vlen_enc = vector_length_encoding(this);
5029     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5030   %}
5031   ins_pipe( pipe_slow );
5032 %}
5033 
5034 instruct vsubB_mem(vec dst, vec src, memory mem) %{
5035   predicate((UseAVX > 0) &&
5036             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5037   match(Set dst (SubVB src (LoadVector mem)));
5038   format %{ "vpsubb  $dst,$src,$mem\t! sub packedB" %}
5039   ins_encode %{
5040     int vlen_enc = vector_length_encoding(this);
5041     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5042   %}
5043   ins_pipe( pipe_slow );
5044 %}
5045 
5046 // Shorts/Chars vector sub
5047 instruct vsubS(vec dst, vec src) %{
5048   predicate(UseAVX == 0);
5049   match(Set dst (SubVS dst src));
5050   format %{ "psubw   $dst,$src\t! sub packedS" %}
5051   ins_encode %{
5052     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
5053   %}
5054   ins_pipe( pipe_slow );
5055 %}
5056
5058 instruct vsubS_reg(vec dst, vec src1, vec src2) %{
5059   predicate(UseAVX > 0);
5060   match(Set dst (SubVS src1 src2));
5061   format %{ "vpsubw  $dst,$src1,$src2\t! sub packedS" %}
5062   ins_encode %{
5063     int vlen_enc = vector_length_encoding(this);
5064     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5065   %}
5066   ins_pipe( pipe_slow );
5067 %}
5068 
5069 instruct vsubS_mem(vec dst, vec src, memory mem) %{
5070   predicate((UseAVX > 0) &&
5071             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5072   match(Set dst (SubVS src (LoadVector mem)));
5073   format %{ "vpsubw  $dst,$src,$mem\t! sub packedS" %}
5074   ins_encode %{
5075     int vlen_enc = vector_length_encoding(this);
5076     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5077   %}
5078   ins_pipe( pipe_slow );
5079 %}
5080 
5081 // Integers vector sub
5082 instruct vsubI(vec dst, vec src) %{
5083   predicate(UseAVX == 0);
5084   match(Set dst (SubVI dst src));
5085   format %{ "psubd   $dst,$src\t! sub packedI" %}
5086   ins_encode %{
5087     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
5088   %}
5089   ins_pipe( pipe_slow );
5090 %}
5091 
5092 instruct vsubI_reg(vec dst, vec src1, vec src2) %{
5093   predicate(UseAVX > 0);
5094   match(Set dst (SubVI src1 src2));
5095   format %{ "vpsubd  $dst,$src1,$src2\t! sub packedI" %}
5096   ins_encode %{
5097     int vlen_enc = vector_length_encoding(this);
5098     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5099   %}
5100   ins_pipe( pipe_slow );
5101 %}
5102 
5103 instruct vsubI_mem(vec dst, vec src, memory mem) %{
5104   predicate((UseAVX > 0) &&
5105             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5106   match(Set dst (SubVI src (LoadVector mem)));
5107   format %{ "vpsubd  $dst,$src,$mem\t! sub packedI" %}
5108   ins_encode %{
5109     int vlen_enc = vector_length_encoding(this);
5110     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5111   %}
5112   ins_pipe( pipe_slow );
5113 %}
5114 
5115 // Longs vector sub
5116 instruct vsubL(vec dst, vec src) %{
5117   predicate(UseAVX == 0);
5118   match(Set dst (SubVL dst src));
5119   format %{ "psubq   $dst,$src\t! sub packedL" %}
5120   ins_encode %{
5121     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
5122   %}
5123   ins_pipe( pipe_slow );
5124 %}
5125 
5126 instruct vsubL_reg(vec dst, vec src1, vec src2) %{
5127   predicate(UseAVX > 0);
5128   match(Set dst (SubVL src1 src2));
5129   format %{ "vpsubq  $dst,$src1,$src2\t! sub packedL" %}
5130   ins_encode %{
5131     int vlen_enc = vector_length_encoding(this);
5132     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5133   %}
5134   ins_pipe( pipe_slow );
5135 %}
5136
5138 instruct vsubL_mem(vec dst, vec src, memory mem) %{
5139   predicate((UseAVX > 0) &&
5140             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5141   match(Set dst (SubVL src (LoadVector mem)));
5142   format %{ "vpsubq  $dst,$src,$mem\t! sub packedL" %}
5143   ins_encode %{
5144     int vlen_enc = vector_length_encoding(this);
5145     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5146   %}
5147   ins_pipe( pipe_slow );
5148 %}
5149 
5150 // Floats vector sub
5151 instruct vsubF(vec dst, vec src) %{
5152   predicate(UseAVX == 0);
5153   match(Set dst (SubVF dst src));
5154   format %{ "subps   $dst,$src\t! sub packedF" %}
5155   ins_encode %{
5156     __ subps($dst$$XMMRegister, $src$$XMMRegister);
5157   %}
5158   ins_pipe( pipe_slow );
5159 %}
5160 
5161 instruct vsubF_reg(vec dst, vec src1, vec src2) %{
5162   predicate(UseAVX > 0);
5163   match(Set dst (SubVF src1 src2));
5164   format %{ "vsubps  $dst,$src1,$src2\t! sub packedF" %}
5165   ins_encode %{
5166     int vlen_enc = vector_length_encoding(this);
5167     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5168   %}
5169   ins_pipe( pipe_slow );
5170 %}
5171 
5172 instruct vsubF_mem(vec dst, vec src, memory mem) %{
5173   predicate((UseAVX > 0) &&
5174             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5175   match(Set dst (SubVF src (LoadVector mem)));
5176   format %{ "vsubps  $dst,$src,$mem\t! sub packedF" %}
5177   ins_encode %{
5178     int vlen_enc = vector_length_encoding(this);
5179     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5180   %}
5181   ins_pipe( pipe_slow );
5182 %}
5183 
5184 // Doubles vector sub
5185 instruct vsubD(vec dst, vec src) %{
5186   predicate(UseAVX == 0);
5187   match(Set dst (SubVD dst src));
5188   format %{ "subpd   $dst,$src\t! sub packedD" %}
5189   ins_encode %{
5190     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
5191   %}
5192   ins_pipe( pipe_slow );
5193 %}
5194 
5195 instruct vsubD_reg(vec dst, vec src1, vec src2) %{
5196   predicate(UseAVX > 0);
5197   match(Set dst (SubVD src1 src2));
5198   format %{ "vsubpd  $dst,$src1,$src2\t! sub packedD" %}
5199   ins_encode %{
5200     int vlen_enc = vector_length_encoding(this);
5201     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5202   %}
5203   ins_pipe( pipe_slow );
5204 %}
5205 
5206 instruct vsubD_mem(vec dst, vec src, memory mem) %{
5207   predicate((UseAVX > 0) &&
5208             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5209   match(Set dst (SubVD src (LoadVector mem)));
5210   format %{ "vsubpd  $dst,$src,$mem\t! sub packedD" %}
5211   ins_encode %{
5212     int vlen_enc = vector_length_encoding(this);
5213     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5214   %}
5215   ins_pipe( pipe_slow );
5216 %}
5217 
5218 // --------------------------------- MUL --------------------------------------
5219 
5220 // Byte vector mul
5221 instruct mulB_reg(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5222   predicate(Matcher::vector_length(n) == 4 ||
5223             Matcher::vector_length(n) == 8);
5224   match(Set dst (MulVB src1 src2));
5225   effect(TEMP dst, TEMP tmp, TEMP scratch);
5226   format %{"vector_mulB $dst,$src1,$src2" %}
5227   ins_encode %{
5228     assert(UseSSE > 3, "required");
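    // There is no byte vector multiply on x86, so the bytes are sign-extended
    // to words, multiplied with pmullw, masked back to the low byte of each
    // word and re-packed with packuswb.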
5229     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
5230     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
5231     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
5232     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5233     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5234     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5235   %}
5236   ins_pipe( pipe_slow );
5237 %}
5238 
5239 instruct mul16B_reg(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5240   predicate(Matcher::vector_length(n) == 16 && UseAVX <= 1);
5241   match(Set dst (MulVB src1 src2));
5242   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5243   format %{"vector_mulB $dst,$src1,$src2" %}
5244   ins_encode %{
5245     assert(UseSSE > 3, "required");
5246     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
5247     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
5248     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
5249     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
5250     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
5251     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
5252     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
5253     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
5254     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5255     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5256     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5257     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5258   %}
5259   ins_pipe( pipe_slow );
5260 %}
5261 
5262 instruct vmul16B_reg_avx(vec dst, vec src1, vec src2, vec tmp, rRegI scratch) %{
5263   predicate(Matcher::vector_length(n) == 16 && UseAVX > 1);
5264   match(Set dst (MulVB src1 src2));
5265   effect(TEMP dst, TEMP tmp, TEMP scratch);
5266   format %{"vector_mulB $dst,$src1,$src2" %}
5267   ins_encode %{
5268     int vlen_enc = Assembler::AVX_256bit;
5269     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5270     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5271     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5272     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5273     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5274     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
5275     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
5276   %}
5277   ins_pipe( pipe_slow );
5278 %}
5279 
5280 instruct vmul32B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5281   predicate(Matcher::vector_length(n) == 32);
5282   match(Set dst (MulVB src1 src2));
5283   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5284   format %{"vector_mulB $dst,$src1,$src2" %}
5285   ins_encode %{
5286     assert(UseAVX > 1, "required");
5287     int vlen_enc = Assembler::AVX_256bit;
5288     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5289     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
5290     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5291     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5292     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5293     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5294     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5295     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5296     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5297     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5298     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5299     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5300     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5301     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
5302   %}
5303   ins_pipe( pipe_slow );
5304 %}
5305 
5306 instruct vmul64B_reg_avx(vec dst, vec src1, vec src2, vec tmp1, vec tmp2, rRegI scratch) %{
5307   predicate(Matcher::vector_length(n) == 64);
5308   match(Set dst (MulVB src1 src2));
5309   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
5310   format %{"vector_mulB $dst,$src1,$src2\n\t" %}
5311   ins_encode %{
5312     assert(UseAVX > 2, "required");
5313     int vlen_enc = Assembler::AVX_512bit;
5314     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
5315     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
5316     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5317     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5318     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5319     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vlen_enc);
5320     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5321     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5322     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5323     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5324     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5325     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5326     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
5327     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
5328     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5329   %}
5330   ins_pipe( pipe_slow );
5331 %}
5332 
5333 // Shorts/Chars vector mul
5334 instruct vmulS(vec dst, vec src) %{
5335   predicate(UseAVX == 0);
5336   match(Set dst (MulVS dst src));
5337   format %{ "pmullw $dst,$src\t! mul packedS" %}
5338   ins_encode %{
5339     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
5340   %}
5341   ins_pipe( pipe_slow );
5342 %}
5343 
5344 instruct vmulS_reg(vec dst, vec src1, vec src2) %{
5345   predicate(UseAVX > 0);
5346   match(Set dst (MulVS src1 src2));
5347   format %{ "vpmullw $dst,$src1,$src2\t! mul packedS" %}
5348   ins_encode %{
5349     int vlen_enc = vector_length_encoding(this);
5350     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5351   %}
5352   ins_pipe( pipe_slow );
5353 %}
5354 
5355 instruct vmulS_mem(vec dst, vec src, memory mem) %{
5356   predicate((UseAVX > 0) &&
5357             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5358   match(Set dst (MulVS src (LoadVector mem)));
5359   format %{ "vpmullw $dst,$src,$mem\t! mul packedS" %}
5360   ins_encode %{
5361     int vlen_enc = vector_length_encoding(this);
5362     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5363   %}
5364   ins_pipe( pipe_slow );
5365 %}
5366 
5367 // Integers vector mul
5368 instruct vmulI(vec dst, vec src) %{
5369   predicate(UseAVX == 0);
5370   match(Set dst (MulVI dst src));
5371   format %{ "pmulld  $dst,$src\t! mul packedI" %}
5372   ins_encode %{
5373     assert(UseSSE > 3, "required");
5374     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
5375   %}
5376   ins_pipe( pipe_slow );
5377 %}
5378 
5379 instruct vmulI_reg(vec dst, vec src1, vec src2) %{
5380   predicate(UseAVX > 0);
5381   match(Set dst (MulVI src1 src2));
5382   format %{ "vpmulld $dst,$src1,$src2\t! mul packedI" %}
5383   ins_encode %{
5384     int vlen_enc = vector_length_encoding(this);
5385     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5386   %}
5387   ins_pipe( pipe_slow );
5388 %}
5389 
5390 instruct vmulI_mem(vec dst, vec src, memory mem) %{
5391   predicate((UseAVX > 0) &&
5392             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5393   match(Set dst (MulVI src (LoadVector mem)));
5394   format %{ "vpmulld $dst,$src,$mem\t! mul packedI" %}
5395   ins_encode %{
5396     int vlen_enc = vector_length_encoding(this);
5397     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5398   %}
5399   ins_pipe( pipe_slow );
5400 %}
5401 
5402 // Longs vector mul
5403 instruct vmulL_reg(vec dst, vec src1, vec src2) %{
5404   predicate(VM_Version::supports_avx512dq());
5405   match(Set dst (MulVL src1 src2));
5406   format %{ "vpmullq $dst,$src1,$src2\t! mul packedL" %}
5407   ins_encode %{
5408     assert(UseAVX > 2, "required");
5409     int vlen_enc = vector_length_encoding(this);
5410     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5411   %}
5412   ins_pipe( pipe_slow );
5413 %}
5414 
5415 instruct vmulL_mem(vec dst, vec src, memory mem) %{
5416   predicate(VM_Version::supports_avx512dq() &&
5417               (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5418   match(Set dst (MulVL src (LoadVector mem)));
5419   format %{ "vpmullq $dst,$src,$mem\t! mul packedL" %}
5420   ins_encode %{
5421     assert(UseAVX > 2, "required");
5422     int vlen_enc = vector_length_encoding(this);
5423     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5424   %}
5425   ins_pipe( pipe_slow );
5426 %}
5427 
5428 instruct mul2L_reg(vec dst, vec src2, legVec tmp) %{
5429   predicate(Matcher::vector_length(n) == 2 && !VM_Version::supports_avx512dq());
5430   match(Set dst (MulVL dst src2));
5431   effect(TEMP dst, TEMP tmp);
5432   format %{ "pshufd $tmp,$src2, 177\n\t"
5433             "pmulld $tmp,$dst\n\t"
5434             "phaddd $tmp,$tmp\n\t"
5435             "pmovzxdq $tmp,$tmp\n\t"
5436             "psllq $tmp, 32\n\t"
5437             "pmuludq $dst,$src2\n\t"
5438             "paddq $dst,$tmp\n\t! mul packed2L" %}
5440   ins_encode %{
5441     assert(VM_Version::supports_sse4_1(), "required");
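    // There is no vpmullq below AVX-512DQ, so the 64x64-bit product is built
    // from 32x32-bit pieces:
    //   a*b = lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32)   (mod 2^64)
    // pmuludq supplies lo*lo, while pmulld + phaddd collect the two cross terms.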
5443     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177);
5444     __ pmulld($tmp$$XMMRegister, $dst$$XMMRegister);
5445     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
5446     __ pmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister);
5447     __ psllq($tmp$$XMMRegister, 32);
5448     __ pmuludq($dst$$XMMRegister, $src2$$XMMRegister);
5449     __ paddq($dst$$XMMRegister, $tmp$$XMMRegister);
5450   %}
5451   ins_pipe( pipe_slow );
5452 %}
5453 
5454 instruct vmul4L_reg_avx(vec dst, vec src1, vec src2, legVec tmp, legVec tmp1) %{
5455   predicate(Matcher::vector_length(n) == 4 && !VM_Version::supports_avx512dq());
5456   match(Set dst (MulVL src1 src2));
5457   effect(TEMP tmp1, TEMP tmp);
5458   format %{ "vpshufd $tmp,$src2\n\t"
5459             "vpmulld $tmp,$src1,$tmp\n\t"
5460             "vphaddd $tmp,$tmp,$tmp\n\t"
5461             "vpmovzxdq $tmp,$tmp\n\t"
5462             "vpsllq $tmp,$tmp\n\t"
5463             "vpmuludq $tmp1,$src1,$src2\n\t"
5464             "vpaddq $dst,$tmp,$tmp1\t! mul packed4L" %}
5465   ins_encode %{
5466     int vlen_enc = Assembler::AVX_256bit;
5467     __ vpshufd($tmp$$XMMRegister, $src2$$XMMRegister, 177, vlen_enc);
5468     __ vpmulld($tmp$$XMMRegister, $src1$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5469     __ vextracti128_high($tmp1$$XMMRegister, $tmp$$XMMRegister);
5470     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5471     __ vpmovzxdq($tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
5472     __ vpsllq($tmp$$XMMRegister, $tmp$$XMMRegister, 32, vlen_enc);
5473     __ vpmuludq($tmp1$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5474     __ vpaddq($dst$$XMMRegister, $tmp$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
5475   %}
5476   ins_pipe( pipe_slow );
5477 %}
5478 
5479 // Floats vector mul
5480 instruct vmulF(vec dst, vec src) %{
5481   predicate(UseAVX == 0);
5482   match(Set dst (MulVF dst src));
5483   format %{ "mulps   $dst,$src\t! mul packedF" %}
5484   ins_encode %{
5485     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
5486   %}
5487   ins_pipe( pipe_slow );
5488 %}
5489 
5490 instruct vmulF_reg(vec dst, vec src1, vec src2) %{
5491   predicate(UseAVX > 0);
5492   match(Set dst (MulVF src1 src2));
5493   format %{ "vmulps  $dst,$src1,$src2\t! mul packedF" %}
5494   ins_encode %{
5495     int vlen_enc = vector_length_encoding(this);
5496     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5497   %}
5498   ins_pipe( pipe_slow );
5499 %}
5500 
5501 instruct vmulF_mem(vec dst, vec src, memory mem) %{
5502   predicate((UseAVX > 0) &&
5503             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5504   match(Set dst (MulVF src (LoadVector mem)));
5505   format %{ "vmulps  $dst,$src,$mem\t! mul packedF" %}
5506   ins_encode %{
5507     int vlen_enc = vector_length_encoding(this);
5508     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5509   %}
5510   ins_pipe( pipe_slow );
5511 %}
5512 
5513 // Doubles vector mul
5514 instruct vmulD(vec dst, vec src) %{
5515   predicate(UseAVX == 0);
5516   match(Set dst (MulVD dst src));
5517   format %{ "mulpd   $dst,$src\t! mul packedD" %}
5518   ins_encode %{
5519     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
5520   %}
5521   ins_pipe( pipe_slow );
5522 %}
5523 
5524 instruct vmulD_reg(vec dst, vec src1, vec src2) %{
5525   predicate(UseAVX > 0);
5526   match(Set dst (MulVD src1 src2));
5527   format %{ "vmulpd  $dst,$src1,$src2\t! mul packedD" %}
5528   ins_encode %{
5529     int vlen_enc = vector_length_encoding(this);
5530     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5531   %}
5532   ins_pipe( pipe_slow );
5533 %}
5534 
5535 instruct vmulD_mem(vec dst, vec src, memory mem) %{
5536   predicate((UseAVX > 0) &&
5537             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5538   match(Set dst (MulVD src (LoadVector mem)));
5539   format %{ "vmulpd  $dst,$src,$mem\t! mul packedD" %}
5540   ins_encode %{
5541     int vlen_enc = vector_length_encoding(this);
5542     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5543   %}
5544   ins_pipe( pipe_slow );
5545 %}
5546 
5547 instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5548   predicate(Matcher::vector_length(n) == 8);
5549   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
5550   effect(TEMP dst, USE src1, USE src2);
5551   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
5552             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
5553          %}
5554   ins_encode %{
5555     assert(UseAVX > 0, "required");
5556 
5557     int vlen_enc = Assembler::AVX_256bit;
5558     int cond = (Assembler::Condition)($copnd$$cmpcode);
5559     __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5560     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5561   %}
5562   ins_pipe( pipe_slow );
5563 %}
5564 
5565 instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
5566   predicate(Matcher::vector_length(n) == 4);
5567   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
5568   effect(TEMP dst, USE src1, USE src2);
5569   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
5570             "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
5571          %}
5572   ins_encode %{
5573     assert(UseAVX > 0, "required");
5574 
5575     int vlen_enc = Assembler::AVX_256bit;
5576     int cond = (Assembler::Condition)($copnd$$cmpcode);
5577     __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
5578     __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
5579   %}
5580   ins_pipe( pipe_slow );
5581 %}
5582 
5583 // --------------------------------- DIV --------------------------------------
5584 
5585 // Floats vector div
5586 instruct vdivF(vec dst, vec src) %{
5587   predicate(UseAVX == 0);
5588   match(Set dst (DivVF dst src));
5589   format %{ "divps   $dst,$src\t! div packedF" %}
5590   ins_encode %{
5591     __ divps($dst$$XMMRegister, $src$$XMMRegister);
5592   %}
5593   ins_pipe( pipe_slow );
5594 %}
5595 
5596 instruct vdivF_reg(vec dst, vec src1, vec src2) %{
5597   predicate(UseAVX > 0);
5598   match(Set dst (DivVF src1 src2));
5599   format %{ "vdivps  $dst,$src1,$src2\t! div packedF" %}
5600   ins_encode %{
5601     int vlen_enc = vector_length_encoding(this);
5602     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5603   %}
5604   ins_pipe( pipe_slow );
5605 %}
5606 
5607 instruct vdivF_mem(vec dst, vec src, memory mem) %{
5608   predicate((UseAVX > 0) &&
5609             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5610   match(Set dst (DivVF src (LoadVector mem)));
5611   format %{ "vdivps  $dst,$src,$mem\t! div packedF" %}
5612   ins_encode %{
5613     int vlen_enc = vector_length_encoding(this);
5614     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5615   %}
5616   ins_pipe( pipe_slow );
5617 %}
5618 
5619 // Doubles vector div
5620 instruct vdivD(vec dst, vec src) %{
5621   predicate(UseAVX == 0);
5622   match(Set dst (DivVD dst src));
5623   format %{ "divpd   $dst,$src\t! div packedD" %}
5624   ins_encode %{
5625     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
5626   %}
5627   ins_pipe( pipe_slow );
5628 %}
5629 
5630 instruct vdivD_reg(vec dst, vec src1, vec src2) %{
5631   predicate(UseAVX > 0);
5632   match(Set dst (DivVD src1 src2));
5633   format %{ "vdivpd  $dst,$src1,$src2\t! div packedD" %}
5634   ins_encode %{
5635     int vlen_enc = vector_length_encoding(this);
5636     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5637   %}
5638   ins_pipe( pipe_slow );
5639 %}
5640 
5641 instruct vdivD_mem(vec dst, vec src, memory mem) %{
5642   predicate((UseAVX > 0) &&
5643             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
5644   match(Set dst (DivVD src (LoadVector mem)));
5645   format %{ "vdivpd  $dst,$src,$mem\t! div packedD" %}
5646   ins_encode %{
5647     int vlen_enc = vector_length_encoding(this);
5648     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
5649   %}
5650   ins_pipe( pipe_slow );
5651 %}
5652 
5653 // ------------------------------ MinMax ---------------------------------------
5654 
5655 // Byte, Short, Int vector Min/Max
5656 instruct minmax_reg_sse(vec dst, vec src) %{
5657   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5658             UseAVX == 0);
5659   match(Set dst (MinV dst src));
5660   match(Set dst (MaxV dst src));
5661   format %{ "vector_minmax  $dst,$src\t!  " %}
5662   ins_encode %{
5663     assert(UseSSE >= 4, "required");
5664 
5665     int opcode = this->ideal_Opcode();
5666     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5667     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister);
5668   %}
5669   ins_pipe( pipe_slow );
5670 %}
5671 
5672 instruct vminmax_reg(vec dst, vec src1, vec src2) %{
5673   predicate(is_integral_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_element_basic_type(n) != T_LONG && // T_BYTE, T_SHORT, T_INT
5674             UseAVX > 0);
5675   match(Set dst (MinV src1 src2));
5676   match(Set dst (MaxV src1 src2));
5677   format %{ "vector_minmax  $dst,$src1,$src2\t!  " %}
5678   ins_encode %{
5679     int opcode = this->ideal_Opcode();
5680     int vlen_enc = vector_length_encoding(this);
5681     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5682 
5683     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5684   %}
5685   ins_pipe( pipe_slow );
5686 %}
5687 
5688 // Long vector Min/Max
5689 instruct minmaxL_reg_sse(vec dst, vec src, rxmm0 tmp) %{
5690   predicate(Matcher::vector_length_in_bytes(n) == 16 && Matcher::vector_element_basic_type(n) == T_LONG &&
5691             UseAVX == 0);
5692   match(Set dst (MinV dst src));
5693   match(Set dst (MaxV src dst));
5694   effect(TEMP dst, TEMP tmp);
5695   format %{ "vector_minmaxL  $dst,$src\t!using $tmp as TEMP" %}
5696   ins_encode %{
5697     assert(UseSSE >= 4, "required");
5698 
5699     int opcode = this->ideal_Opcode();
5700     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5701     assert(elem_bt == T_LONG, "sanity");
5702 
5703     __ pminmax(opcode, elem_bt, $dst$$XMMRegister, $src$$XMMRegister, $tmp$$XMMRegister);
5704   %}
5705   ins_pipe( pipe_slow );
5706 %}
5707 
5708 instruct vminmaxL_reg_avx(legVec dst, legVec src1, legVec src2) %{
5709   predicate(Matcher::vector_length_in_bytes(n) <= 32 && Matcher::vector_element_basic_type(n) == T_LONG &&
5710             UseAVX > 0 && !VM_Version::supports_avx512vl());
5711   match(Set dst (MinV src1 src2));
5712   match(Set dst (MaxV src1 src2));
5713   effect(TEMP dst);
5714   format %{ "vector_minmaxL  $dst,$src1,$src2\t! " %}
5715   ins_encode %{
5716     int vlen_enc = vector_length_encoding(this);
5717     int opcode = this->ideal_Opcode();
5718     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5719     assert(elem_bt == T_LONG, "sanity");
5720 
5721     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5722   %}
5723   ins_pipe( pipe_slow );
5724 %}
5725 
5726 instruct vminmaxL_reg_evex(vec dst, vec src1, vec src2) %{
5727   predicate((Matcher::vector_length_in_bytes(n) == 64 || VM_Version::supports_avx512vl()) &&
5728             Matcher::vector_element_basic_type(n) == T_LONG);
5729   match(Set dst (MinV src1 src2));
5730   match(Set dst (MaxV src1 src2));
5731   format %{ "vector_minmaxL  $dst,$src1,src2\t! " %}
5732   ins_encode %{
5733     assert(UseAVX > 2, "required");
5734 
5735     int vlen_enc = vector_length_encoding(this);
5736     int opcode = this->ideal_Opcode();
5737     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5738     assert(elem_bt == T_LONG, "sanity");
5739 
5740     __ vpminmax(opcode, elem_bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
5741   %}
5742   ins_pipe( pipe_slow );
5743 %}
5744 
5745 // Float/Double vector Min/Max
5746 instruct minmaxFP_reg(legVec dst, legVec a, legVec b, legVec tmp, legVec atmp, legVec btmp) %{
5747   predicate(Matcher::vector_length_in_bytes(n) <= 32 &&
5748             is_floating_point_type(Matcher::vector_element_basic_type(n)) && // T_FLOAT, T_DOUBLE
5749             UseAVX > 0);
5750   match(Set dst (MinV a b));
5751   match(Set dst (MaxV a b));
5752   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
5753   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $tmp, $atmp, $btmp as TEMP" %}
5754   ins_encode %{
5755     assert(UseAVX > 0, "required");
5756 
5757     int opcode = this->ideal_Opcode();
5758     int vlen_enc = vector_length_encoding(this);
5759     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5760 
5761     __ vminmax_fp(opcode, elem_bt,
5762                   $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
5763                   $tmp$$XMMRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
5764   %}
5765   ins_pipe( pipe_slow );
5766 %}
5767 
5768 instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktmp) %{
5769   predicate(Matcher::vector_length_in_bytes(n) == 64 &&
5770             is_floating_point_type(Matcher::vector_element_basic_type(n))); // T_FLOAT, T_DOUBLE
5771   match(Set dst (MinV a b));
5772   match(Set dst (MaxV a b));
5773   effect(TEMP dst, USE a, USE b, TEMP atmp, TEMP btmp, TEMP ktmp);
5774   format %{ "vector_minmaxFP  $dst,$a,$b\t!using $atmp, $btmp as TEMP" %}
5775   ins_encode %{
5776     assert(UseAVX > 2, "required");
5777 
5778     int opcode = this->ideal_Opcode();
5779     int vlen_enc = vector_length_encoding(this);
5780     BasicType elem_bt = Matcher::vector_element_basic_type(this);
5781 
5782     __ evminmax_fp(opcode, elem_bt,
5783                    $dst$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister,
5784                    $ktmp$$KRegister, $atmp$$XMMRegister , $btmp$$XMMRegister, vlen_enc);
5785   %}
5786   ins_pipe( pipe_slow );
5787 %}
5788 
5789 // --------------------------------- Signum/CopySign ---------------------------
5790 
5791 instruct signumF_reg(regF dst, regF zero, regF one, rRegP scratch, rFlagsReg cr) %{
5792   match(Set dst (SignumF dst (Binary zero one)));
5793   effect(TEMP scratch, KILL cr);
5794   format %{ "signumF $dst, $dst\t! using $scratch as TEMP" %}
5795   ins_encode %{
5796     int opcode = this->ideal_Opcode();
5797     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
5798   %}
5799   ins_pipe( pipe_slow );
5800 %}
5801 
5802 instruct signumD_reg(regD dst, regD zero, regD one, rRegP scratch, rFlagsReg cr) %{
5803   match(Set dst (SignumD dst (Binary zero one)));
5804   effect(TEMP scratch, KILL cr);
5805   format %{ "signumD $dst, $dst\t! using $scratch as TEMP" %}
5806   ins_encode %{
5807     int opcode = this->ideal_Opcode();
5808     __ signum_fp(opcode, $dst$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister, $scratch$$Register);
5809   %}
5810   ins_pipe( pipe_slow );
5811 %}
5812 
5813 // ---------------------------------------
5814 // For copySign use 0xE4 as writemask for vpternlog
5815 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
5816 // C (xmm2) is set to 0x7FFFFFFF
5817 // Wherever xmm2 is 0, we want to pick from B (sign)
5818 // Wherever xmm2 is 1, we want to pick from A (src)
5819 //
5820 // A B C Result
5821 // 0 0 0 0
5822 // 0 0 1 0
5823 // 0 1 0 1
5824 // 0 1 1 0
5825 // 1 0 0 0
5826 // 1 0 1 1
5827 // 1 1 0 1
5828 // 1 1 1 1
5829 //
5830 // Result going from high bit to low bit is binary 11100100 = 0xE4
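// Equivalently, the table above is the boolean function
//   Result = (A & C) | (B & ~C)
// so with C = 0x7FFFFFFF every non-sign bit is taken from A and the sign bit
// from B; the double flavor below uses the 64-bit mask 0x7FFFFFFFFFFFFFFF in
// the same way.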
5831 // ---------------------------------------
5832 
5833 #ifdef _LP64
5834 instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
5835   match(Set dst (CopySignF dst src));
5836   effect(TEMP tmp1, TEMP tmp2);
5837   format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
5838   ins_encode %{
5839     __ movl($tmp2$$Register, 0x7FFFFFFF);
5840     __ movdl($tmp1$$XMMRegister, $tmp2$$Register);
5841     __ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
5842   %}
5843   ins_pipe( pipe_slow );
5844 %}
5845 
5846 instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
5847   match(Set dst (CopySignD dst (Binary src zero)));
5848   ins_cost(100);
5849   effect(TEMP tmp1, TEMP tmp2);
5850   format %{ "CopySignD  $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
5851   ins_encode %{
5852     __ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
5853     __ movq($tmp1$$XMMRegister, $tmp2$$Register);
5854     __ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
5855   %}
5856   ins_pipe( pipe_slow );
5857 %}
5858 #endif // _LP64
5859 
5860 // --------------------------------- Sqrt --------------------------------------
5861 
5862 instruct vsqrtF_reg(vec dst, vec src) %{
5863   match(Set dst (SqrtVF src));
5864   format %{ "vsqrtps  $dst,$src\t! sqrt packedF" %}
5865   ins_encode %{
5866     assert(UseAVX > 0, "required");
5867     int vlen_enc = vector_length_encoding(this);
5868     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5869   %}
5870   ins_pipe( pipe_slow );
5871 %}
5872 
5873 instruct vsqrtF_mem(vec dst, memory mem) %{
5874   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
5875   match(Set dst (SqrtVF (LoadVector mem)));
5876   format %{ "vsqrtps  $dst,$mem\t! sqrt packedF" %}
5877   ins_encode %{
5878     assert(UseAVX > 0, "required");
5879     int vlen_enc = vector_length_encoding(this);
5880     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vlen_enc);
5881   %}
5882   ins_pipe( pipe_slow );
5883 %}
5884 
5885 // Floating point vector sqrt
5886 instruct vsqrtD_reg(vec dst, vec src) %{
5887   match(Set dst (SqrtVD src));
5888   format %{ "vsqrtpd  $dst,$src\t! sqrt packedD" %}
5889   ins_encode %{
5890     assert(UseAVX > 0, "required");
5891     int vlen_enc = vector_length_encoding(this);
5892     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
5893   %}
5894   ins_pipe( pipe_slow );
5895 %}
5896 
5897 instruct vsqrtD_mem(vec dst, memory mem) %{
5898   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
5899   match(Set dst (SqrtVD (LoadVector mem)));
5900   format %{ "vsqrtpd  $dst,$mem\t! sqrt packedD" %}
5901   ins_encode %{
5902     assert(UseAVX > 0, "required");
5903     int vlen_enc = vector_length_encoding(this);
5904     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vlen_enc);
5905   %}
5906   ins_pipe( pipe_slow );
5907 %}
5908 
5909 // ------------------------------ Shift ---------------------------------------
5910 
5911 // Left and right shift count vectors are the same on x86
5912 // (only lowest bits of xmm reg are used for count).
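// (The packed shift-by-xmm forms -- psllw/pslld/psllq and their right-shift
// counterparts -- take the count from the low 64 bits of the count operand
// regardless of element size, so a single movdl-loaded value serves all of
// them.)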
5913 instruct vshiftcnt(vec dst, rRegI cnt) %{
5914   match(Set dst (LShiftCntV cnt));
5915   match(Set dst (RShiftCntV cnt));
5916   format %{ "movdl    $dst,$cnt\t! load shift count" %}
5917   ins_encode %{
5918     __ movdl($dst$$XMMRegister, $cnt$$Register);
5919   %}
5920   ins_pipe( pipe_slow );
5921 %}
5922 
5923 // Byte vector shift
5924 instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5925   predicate(Matcher::vector_length(n) <= 8 && VectorNode::is_vshift_cnt(n->in(2)));
5926   match(Set dst ( LShiftVB src shift));
5927   match(Set dst ( RShiftVB src shift));
5928   match(Set dst (URShiftVB src shift));
5929   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
5930   format %{"vector_byte_shift $dst,$src,$shift" %}
5931   ins_encode %{
5932     assert(UseSSE > 3, "required");
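    // There are no byte vector shifts on x86: extend the bytes to words,
    // shift as words, mask each word back to its low byte and re-pack.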
5933     int opcode = this->ideal_Opcode();
5934     bool sign = (opcode != Op_URShiftVB);
5935     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
5936     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
5937     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5938     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
5939     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
5940   %}
5941   ins_pipe( pipe_slow );
5942 %}
5943 
5944 instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
5945   predicate(Matcher::vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
5946             UseAVX <= 1);
5947   match(Set dst ( LShiftVB src shift));
5948   match(Set dst ( RShiftVB src shift));
5949   match(Set dst (URShiftVB src shift));
5950   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
5951   format %{"vector_byte_shift $dst,$src,$shift" %}
5952   ins_encode %{
5953     assert(UseSSE > 3, "required");
5954     int opcode = this->ideal_Opcode();
5955     bool sign = (opcode != Op_URShiftVB);
5956     __ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
5957     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
5958     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
5959     __ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
5960     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
5961     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
5962     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
5963     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
5964     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
5965   %}
5966   ins_pipe( pipe_slow );
5967 %}
5968 
5969 instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5970   predicate(Matcher::vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
5971             UseAVX > 1);
5972   match(Set dst ( LShiftVB src shift));
5973   match(Set dst ( RShiftVB src shift));
5974   match(Set dst (URShiftVB src shift));
5975   effect(TEMP dst, TEMP tmp, TEMP scratch);
5976   format %{"vector_byte_shift $dst,$src,$shift" %}
5977   ins_encode %{
5978     int opcode = this->ideal_Opcode();
5979     bool sign = (opcode != Op_URShiftVB);
5980     int vlen_enc = Assembler::AVX_256bit;
5981     __ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
5982     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
5983     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
5984     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
5985     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
5986   %}
5987   ins_pipe( pipe_slow );
5988 %}
5989 
5990 instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
5991   predicate(Matcher::vector_length(n) == 32 && VectorNode::is_vshift_cnt(n->in(2)));
5992   match(Set dst ( LShiftVB src shift));
5993   match(Set dst ( RShiftVB src shift));
5994   match(Set dst (URShiftVB src shift));
5995   effect(TEMP dst, TEMP tmp, TEMP scratch);
5996   format %{"vector_byte_shift $dst,$src,$shift" %}
5997   ins_encode %{
5998     assert(UseAVX > 1, "required");
5999     int opcode = this->ideal_Opcode();
6000     bool sign = (opcode != Op_URShiftVB);
6001     int vlen_enc = Assembler::AVX_256bit;
6002     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
6003     __ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6004     __ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6005     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6006     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6007     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6008     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6009     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6010     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6011   %}
6012   ins_pipe( pipe_slow );
6013 %}
6014 
6015 instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
6016   predicate(Matcher::vector_length(n) == 64 && VectorNode::is_vshift_cnt(n->in(2)));
6017   match(Set dst ( LShiftVB src shift));
6018   match(Set dst ( RShiftVB src shift));
6019   match(Set dst (URShiftVB src shift));
6020   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
6021   format %{"vector_byte_shift $dst,$src,$shift" %}
6022   ins_encode %{
6023     assert(UseAVX > 2, "required");
6024     int opcode = this->ideal_Opcode();
6025     bool sign = (opcode != Op_URShiftVB);
6026     int vlen_enc = Assembler::AVX_512bit;
6027     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
6028     __ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
6029     __ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6030     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6031     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6032     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
6033     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6034     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6035     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6036     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
6037     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
6038     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6039   %}
6040   ins_pipe( pipe_slow );
6041 %}
6042 
6043 // Shorts vector logical right shift produces an incorrect Java result
6044 // for negative data, because Java code converts short values into ints
6045 // with sign extension before shifting. Char vectors are fine, however,
6046 // since chars are unsigned values.
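// For example (plain Java, shown only to illustrate the semantics):
//   short s = -1;      // 0xFFFF
//   int   r = s >>> 2; // s is first widened to 0xFFFFFFFF,
//                      // so r == 0x3FFFFFFF rather than 0x3FFF
// A 16-bit-lane logical shift would therefore not match the scalar result.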
6047 // Shorts/Chars vector left shift
6048 instruct vshiftS(vec dst, vec src, vec shift) %{
6049   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6050   match(Set dst ( LShiftVS src shift));
6051   match(Set dst ( RShiftVS src shift));
6052   match(Set dst (URShiftVS src shift));
6053   effect(TEMP dst, USE src, USE shift);
6054   format %{ "vshiftw  $dst,$src,$shift\t! shift packedS" %}
6055   ins_encode %{
6056     int opcode = this->ideal_Opcode();
6057     if (UseAVX > 0) {
6058       int vlen_enc = vector_length_encoding(this);
6059       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6060     } else {
6061       int vlen = Matcher::vector_length(this);
6062       if (vlen == 2) {
6063         __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6064         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6065       } else if (vlen == 4) {
6066         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6067         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6068       } else {
6069         assert (vlen == 8, "sanity");
6070         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6071         __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6072       }
6073     }
6074   %}
6075   ins_pipe( pipe_slow );
6076 %}
6077 
6078 // Integers vector left shift
6079 instruct vshiftI(vec dst, vec src, vec shift) %{
6080   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6081   match(Set dst ( LShiftVI src shift));
6082   match(Set dst ( RShiftVI src shift));
6083   match(Set dst (URShiftVI src shift));
6084   effect(TEMP dst, USE src, USE shift);
6085   format %{ "vshiftd  $dst,$src,$shift\t! shift packedI" %}
6086   ins_encode %{
6087     int opcode = this->ideal_Opcode();
6088     if (UseAVX > 0) {
6089       int vlen_enc = vector_length_encoding(this);
6090       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6091     } else {
6092       int vlen = Matcher::vector_length(this);
6093       if (vlen == 2) {
6094         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6095         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6096       } else {
6097         assert(vlen == 4, "sanity");
6098         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6099         __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6100       }
6101     }
6102   %}
6103   ins_pipe( pipe_slow );
6104 %}
6105 
6106 // Integers vector left constant shift
6107 instruct vshiftI_imm(vec dst, vec src, immI8 shift) %{
6108   match(Set dst (LShiftVI src (LShiftCntV shift)));
6109   match(Set dst (RShiftVI src (RShiftCntV shift)));
6110   match(Set dst (URShiftVI src (RShiftCntV shift)));
6111   format %{ "vshiftd_imm  $dst,$src,$shift\t! shift packedI" %}
6112   ins_encode %{
6113     int opcode = this->ideal_Opcode();
6114     if (UseAVX > 0) {
6115       int vector_len = vector_length_encoding(this);
6116       __ vshiftd_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6117     } else {
6118       int vlen = Matcher::vector_length(this);
6119       if (vlen == 2) {
6120         __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
6121         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6122       } else {
6123         assert(vlen == 4, "sanity");
6124         __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6125         __ vshiftd_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6126       }
6127     }
6128   %}
6129   ins_pipe( pipe_slow );
6130 %}
6131 
6132 // Longs vector shift
6133 instruct vshiftL(vec dst, vec src, vec shift) %{
6134   predicate(VectorNode::is_vshift_cnt(n->in(2)));
6135   match(Set dst ( LShiftVL src shift));
6136   match(Set dst (URShiftVL src shift));
6137   effect(TEMP dst, USE src, USE shift);
6138   format %{ "vshiftq  $dst,$src,$shift\t! shift packedL" %}
6139   ins_encode %{
6140     int opcode = this->ideal_Opcode();
6141     if (UseAVX > 0) {
6142       int vlen_enc = vector_length_encoding(this);
6143       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6144     } else {
6145       assert(Matcher::vector_length(this) == 2, "");
6146       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6147       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
6148     }
6149   %}
6150   ins_pipe( pipe_slow );
6151 %}
6152 
6153 // Longs vector constant shift
6154 instruct vshiftL_imm(vec dst, vec src, immI8 shift) %{
6155   match(Set dst (LShiftVL src (LShiftCntV shift)));
6156   match(Set dst (URShiftVL src (RShiftCntV shift)));
6157   format %{ "vshiftq_imm  $dst,$src,$shift\t! shift packedL" %}
6158   ins_encode %{
6159     int opcode = this->ideal_Opcode();
6160     if (UseAVX > 0) {
6161       int vector_len = vector_length_encoding(this);
6162       __ vshiftq_imm(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
6163     } else {
6164       assert(Matcher::vector_length(this) == 2, "");
6165       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6166       __ vshiftq_imm(opcode, $dst$$XMMRegister, $shift$$constant);
6167     }
6168   %}
6169   ins_pipe( pipe_slow );
6170 %}
6171 
6172 // -------------------ArithmeticRightShift -----------------------------------
6173 // Long vector arithmetic right shift
6174 instruct vshiftL_arith_reg(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
6175   predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX <= 2);
6176   match(Set dst (RShiftVL src shift));
6177   effect(TEMP dst, TEMP tmp, TEMP scratch);
6178   format %{ "vshiftq $dst,$src,$shift" %}
6179   ins_encode %{
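    // There is no arithmetic 64-bit right shift (psraq) before AVX-512.
    // Emulate it with a logical shift plus sign extension:
    //   ((x >>> n) ^ m) - m,  where m = sign_mask >>> n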
6180     uint vlen = Matcher::vector_length(this);
6181     if (vlen == 2) {
6182       assert(UseSSE >= 2, "required");
6183       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
6184       __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
6185       __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6186       __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
6187       __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
6188       __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
6189     } else {
6190       assert(vlen == 4, "sanity");
6191       assert(UseAVX > 1, "required");
6192       int vlen_enc = Assembler::AVX_256bit;
6193       __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6194       __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
6195       __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6196       __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6197       __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
6198     }
6199   %}
6200   ins_pipe( pipe_slow );
6201 %}
6202 
6203 instruct vshiftL_arith_reg_evex(vec dst, vec src, vec shift) %{
6204   predicate(VectorNode::is_vshift_cnt(n->in(2)) && UseAVX > 2);
6205   match(Set dst (RShiftVL src shift));
6206   format %{ "vshiftq $dst,$src,$shift" %}
6207   ins_encode %{
6208     int vlen_enc = vector_length_encoding(this);
6209     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6210   %}
6211   ins_pipe( pipe_slow );
6212 %}
6213 
6214 // ------------------- Variable Shift -----------------------------
6215 // Byte variable shift
6216 instruct vshift8B_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6217   predicate(Matcher::vector_length(n) <= 8 &&
6218             !VectorNode::is_vshift_cnt(n->in(2)) &&
6219             !VM_Version::supports_avx512bw());
6220   match(Set dst ( LShiftVB src shift));
6221   match(Set dst ( RShiftVB src shift));
6222   match(Set dst (URShiftVB src shift));
6223   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6224   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6225   ins_encode %{
6226     assert(UseAVX >= 2, "required");
6227 
6228     int opcode = this->ideal_Opcode();
6229     int vlen_enc = Assembler::AVX_128bit;
6230     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6231     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6232   %}
6233   ins_pipe( pipe_slow );
6234 %}
6235 
6236 instruct vshift16B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6237   predicate(Matcher::vector_length(n) == 16 &&
6238             !VectorNode::is_vshift_cnt(n->in(2)) &&
6239             !VM_Version::supports_avx512bw());
6240   match(Set dst ( LShiftVB src shift));
6241   match(Set dst ( RShiftVB src shift));
6242   match(Set dst (URShiftVB src shift));
6243   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6244   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6245   ins_encode %{
6246     assert(UseAVX >= 2, "required");
6247 
6248     int opcode = this->ideal_Opcode();
6249     int vlen_enc = Assembler::AVX_128bit;
6250     // Shift lower half and get word result in dst
6251     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6252 
6253     // Shift upper half and get word result in vtmp1
6254     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6255     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6256     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6257 
6258     // Merge and down convert the two word results to byte in dst
6259     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6260   %}
6261   ins_pipe( pipe_slow );
6262 %}
6263 
6264 instruct vshift32B_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, vec vtmp3, vec vtmp4, rRegP scratch) %{
6265   predicate(Matcher::vector_length(n) == 32 &&
6266             !VectorNode::is_vshift_cnt(n->in(2)) &&
6267             !VM_Version::supports_avx512bw());
6268   match(Set dst ( LShiftVB src shift));
6269   match(Set dst ( RShiftVB src shift));
6270   match(Set dst (URShiftVB src shift));
6271   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP scratch);
6272   format %{ "vector_varshift_byte $dst, $src, $shift\n\t using $vtmp1, $vtmp2, $vtmp3, $vtmp4 and $scratch as TEMP" %}
6273   ins_encode %{
6274     assert(UseAVX >= 2, "required");
6275 
6276     int opcode = this->ideal_Opcode();
6277     int vlen_enc = Assembler::AVX_128bit;
6278     // Process lower 128 bits and get result in dst
6279     __ varshiftbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6280     __ vpshufd($vtmp1$$XMMRegister, $src$$XMMRegister, 0xE, 0);
6281     __ vpshufd($vtmp2$$XMMRegister, $shift$$XMMRegister, 0xE, 0);
6282     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6283     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0);
6284 
6285     // Process higher 128 bits and get result in vtmp3
6286     __ vextracti128_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6287     __ vextracti128_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6288     __ varshiftbw(opcode, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp4$$XMMRegister, $scratch$$Register);
6289     __ vpshufd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, 0xE, 0);
6290     __ vpshufd($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, 0xE, 0);
6291     __ varshiftbw(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6292     __ vpackuswb($vtmp1$$XMMRegister, $vtmp3$$XMMRegister, $vtmp1$$XMMRegister, 0);
6293 
6294     // Merge the two results in dst
6295     __ vinserti128($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6296   %}
6297   ins_pipe( pipe_slow );
6298 %}
6299 
6300 instruct vshiftB_var_evex_bw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6301   predicate(Matcher::vector_length(n) <= 32 &&
6302             !VectorNode::is_vshift_cnt(n->in(2)) &&
6303             VM_Version::supports_avx512bw());
6304   match(Set dst ( LShiftVB src shift));
6305   match(Set dst ( RShiftVB src shift));
6306   match(Set dst (URShiftVB src shift));
6307   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6308   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp, $scratch as TEMP" %}
6309   ins_encode %{
6310     assert(UseAVX > 2, "required");
6311 
6312     int opcode = this->ideal_Opcode();
6313     int vlen_enc = vector_length_encoding(this);
6314     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp$$XMMRegister, $scratch$$Register);
6315   %}
6316   ins_pipe( pipe_slow );
6317 %}
6318 
6319 instruct vshift64B_var_evex_bw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6320   predicate(Matcher::vector_length(n) == 64 &&
6321             !VectorNode::is_vshift_cnt(n->in(2)) &&
6322             VM_Version::supports_avx512bw());
6323   match(Set dst ( LShiftVB src shift));
6324   match(Set dst ( RShiftVB src shift));
6325   match(Set dst (URShiftVB src shift));
6326   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6327   format %{ "vector_varshift_byte $dst, $src, $shift\n\t! using $vtmp1, $vtmp2 and $scratch as TEMP" %}
6328   ins_encode %{
6329     assert(UseAVX > 2, "required");
6330 
6331     int opcode = this->ideal_Opcode();
6332     int vlen_enc = Assembler::AVX_256bit;
6333     __ evarshiftb(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc, $vtmp1$$XMMRegister, $scratch$$Register);
6334     __ vextracti64x4_high($vtmp1$$XMMRegister, $src$$XMMRegister);
6335     __ vextracti64x4_high($vtmp2$$XMMRegister, $shift$$XMMRegister);
6336     __ evarshiftb(opcode, $vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, vlen_enc, $vtmp2$$XMMRegister, $scratch$$Register);
6337     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, 0x1);
6338   %}
6339   ins_pipe( pipe_slow );
6340 %}
6341 
6342 // Short variable shift
6343 instruct vshift8S_var_nobw(vec dst, vec src, vec shift, vec vtmp, rRegP scratch) %{
6344   predicate(Matcher::vector_length(n) <= 8 &&
6345             !VectorNode::is_vshift_cnt(n->in(2)) &&
6346             !VM_Version::supports_avx512bw());
6347   match(Set dst ( LShiftVS src shift));
6348   match(Set dst ( RShiftVS src shift));
6349   match(Set dst (URShiftVS src shift));
6350   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6351   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
6352   ins_encode %{
6353     assert(UseAVX >= 2, "required");
6354 
6355     int opcode = this->ideal_Opcode();
6356     bool sign = (opcode != Op_URShiftVS);
6357     int vlen_enc = Assembler::AVX_256bit;
6358     __ vextendwd(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6359     __ vpmovzxwd($vtmp$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6360     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
6361     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6362     __ vextracti128_high($vtmp$$XMMRegister, $dst$$XMMRegister);
6363     __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6364   %}
6365   ins_pipe( pipe_slow );
6366 %}
6367 
6368 instruct vshift16S_var_nobw(vec dst, vec src, vec shift, vec vtmp1, vec vtmp2, rRegP scratch) %{
6369   predicate(Matcher::vector_length(n) == 16 &&
6370             !VectorNode::is_vshift_cnt(n->in(2)) &&
6371             !VM_Version::supports_avx512bw());
6372   match(Set dst ( LShiftVS src shift));
6373   match(Set dst ( RShiftVS src shift));
6374   match(Set dst (URShiftVS src shift));
6375   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6376   format %{ "vector_var_shift_left_short $dst, $src, $shift\n\t" %}
6377   ins_encode %{
6378     assert(UseAVX >= 2, "required");
6379 
6380     int opcode = this->ideal_Opcode();
6381     bool sign = (opcode != Op_URShiftVS);
6382     int vlen_enc = Assembler::AVX_256bit;
6383     // Shift lower half, with result in vtmp2 using vtmp1 as TEMP
6384     __ vextendwd(sign, $vtmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
6385     __ vpmovzxwd($vtmp1$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6386     __ varshiftd(opcode, $vtmp2$$XMMRegister, $vtmp2$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6387     __ vpand($vtmp2$$XMMRegister, $vtmp2$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6388 
6389     // Shift upper half, with result in dst using vtmp1 as TEMP
6390     __ vextracti128_high($dst$$XMMRegister, $src$$XMMRegister);
6391     __ vextracti128_high($vtmp1$$XMMRegister, $shift$$XMMRegister);
6392     __ vextendwd(sign, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6393     __ vpmovzxwd($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6394     __ varshiftd(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, vlen_enc);
6395     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6396 
6397     // Merge lower and upper half result into dst
6398     __ vpackusdw($dst$$XMMRegister, $vtmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6399     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc);
6400   %}
6401   ins_pipe( pipe_slow );
6402 %}
6403 
6404 instruct vshift16S_var_evex_bw(vec dst, vec src, vec shift) %{
6405   predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6406             VM_Version::supports_avx512bw());
6407   match(Set dst ( LShiftVS src shift));
6408   match(Set dst ( RShiftVS src shift));
6409   match(Set dst (URShiftVS src shift));
6410   format %{ "vector_varshift_short $dst,$src,$shift\t!" %}
6411   ins_encode %{
6412     assert(UseAVX > 2, "required");
6413 
6414     int opcode = this->ideal_Opcode();
6415     int vlen_enc = vector_length_encoding(this);
6416     if (!VM_Version::supports_avx512vl()) {
6417       vlen_enc = Assembler::AVX_512bit;
6418     }
6419     __ varshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6420   %}
6421   ins_pipe( pipe_slow );
6422 %}
6423 
6424 // Integer variable shift
6425 instruct vshiftI_var(vec dst, vec src, vec shift) %{
6426   predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6427   match(Set dst ( LShiftVI src shift));
6428   match(Set dst ( RShiftVI src shift));
6429   match(Set dst (URShiftVI src shift));
6430   format %{ "vector_varshift_int $dst,$src,$shift\t!" %}
6431   ins_encode %{
6432     assert(UseAVX >= 2, "required");
6433 
6434     int opcode = this->ideal_Opcode();
6435     int vlen_enc = vector_length_encoding(this);
6436     __ varshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6437   %}
6438   ins_pipe( pipe_slow );
6439 %}
6440 
6441 // Long variable shift
6442 instruct vshiftL_var(vec dst, vec src, vec shift) %{
6443   predicate(!VectorNode::is_vshift_cnt(n->in(2)));
6444   match(Set dst ( LShiftVL src shift));
6445   match(Set dst (URShiftVL src shift));
6446   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6447   ins_encode %{
6448     assert(UseAVX >= 2, "required");
6449 
6450     int opcode = this->ideal_Opcode();
6451     int vlen_enc = vector_length_encoding(this);
6452     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6453   %}
6454   ins_pipe( pipe_slow );
6455 %}
6456 
6457 // Long variable right shift arithmetic
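     // AVX2 has variable logical shifts for 64-bit lanes (vpsllvq/vpsrlvq) but no
     // arithmetic right shift (vpsravq is AVX-512 only), so the AVX2 form below
     // emulates it and needs a vector TEMP.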
6458 instruct vshiftL_arith_var(vec dst, vec src, vec shift, vec vtmp) %{
6459   predicate(Matcher::vector_length(n) <= 4 &&
6460             !VectorNode::is_vshift_cnt(n->in(2)) &&
6461             UseAVX == 2);
6462   match(Set dst (RShiftVL src shift));
6463   effect(TEMP dst, TEMP vtmp);
6464   format %{ "vector_varshift_long $dst,$src,$shift\t! using $vtmp as TEMP" %}
6465   ins_encode %{
6466     int opcode = this->ideal_Opcode();
6467     int vlen_enc = vector_length_encoding(this);
6468     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc,
6469                  $vtmp$$XMMRegister);
6470   %}
6471   ins_pipe( pipe_slow );
6472 %}
6473 
6474 instruct vshiftL_arith_var_evex(vec dst, vec src, vec shift) %{
6475   predicate(!VectorNode::is_vshift_cnt(n->in(2)) &&
6476             UseAVX > 2);
6477   match(Set dst (RShiftVL src shift));
6478   format %{ "vector_varshift_long $dst,$src,$shift\t!" %}
6479   ins_encode %{
6480     int opcode = this->ideal_Opcode();
6481     int vlen_enc = vector_length_encoding(this);
6482     __ varshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vlen_enc);
6483   %}
6484   ins_pipe( pipe_slow );
6485 %}
6486 
6487 // --------------------------------- AND --------------------------------------
6488 
6489 instruct vand(vec dst, vec src) %{
6490   predicate(UseAVX == 0);
6491   match(Set dst (AndV dst src));
6492   format %{ "pand    $dst,$src\t! and vectors" %}
6493   ins_encode %{
6494     __ pand($dst$$XMMRegister, $src$$XMMRegister);
6495   %}
6496   ins_pipe( pipe_slow );
6497 %}
6498 
6499 instruct vand_reg(vec dst, vec src1, vec src2) %{
6500   predicate(UseAVX > 0);
6501   match(Set dst (AndV src1 src2));
6502   format %{ "vpand   $dst,$src1,$src2\t! and vectors" %}
6503   ins_encode %{
6504     int vlen_enc = vector_length_encoding(this);
6505     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6506   %}
6507   ins_pipe( pipe_slow );
6508 %}
6509 
6510 instruct vand_mem(vec dst, vec src, memory mem) %{
6511   predicate((UseAVX > 0) &&
6512             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6513   match(Set dst (AndV src (LoadVector mem)));
6514   format %{ "vpand   $dst,$src,$mem\t! and vectors" %}
6515   ins_encode %{
6516     int vlen_enc = vector_length_encoding(this);
6517     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6518   %}
6519   ins_pipe( pipe_slow );
6520 %}
6521 
6522 // --------------------------------- OR ---------------------------------------
6523 
6524 instruct vor(vec dst, vec src) %{
6525   predicate(UseAVX == 0);
6526   match(Set dst (OrV dst src));
6527   format %{ "por     $dst,$src\t! or vectors" %}
6528   ins_encode %{
6529     __ por($dst$$XMMRegister, $src$$XMMRegister);
6530   %}
6531   ins_pipe( pipe_slow );
6532 %}
6533 
6534 instruct vor_reg(vec dst, vec src1, vec src2) %{
6535   predicate(UseAVX > 0);
6536   match(Set dst (OrV src1 src2));
6537   format %{ "vpor    $dst,$src1,$src2\t! or vectors" %}
6538   ins_encode %{
6539     int vlen_enc = vector_length_encoding(this);
6540     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6541   %}
6542   ins_pipe( pipe_slow );
6543 %}
6544 
6545 instruct vor_mem(vec dst, vec src, memory mem) %{
6546   predicate((UseAVX > 0) &&
6547             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6548   match(Set dst (OrV src (LoadVector mem)));
6549   format %{ "vpor    $dst,$src,$mem\t! or vectors" %}
6550   ins_encode %{
6551     int vlen_enc = vector_length_encoding(this);
6552     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6553   %}
6554   ins_pipe( pipe_slow );
6555 %}
6556 
6557 // --------------------------------- XOR --------------------------------------
6558 
6559 instruct vxor(vec dst, vec src) %{
6560   predicate(UseAVX == 0);
6561   match(Set dst (XorV dst src));
6562   format %{ "pxor    $dst,$src\t! xor vectors" %}
6563   ins_encode %{
6564     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
6565   %}
6566   ins_pipe( pipe_slow );
6567 %}
6568 
6569 instruct vxor_reg(vec dst, vec src1, vec src2) %{
6570   predicate(UseAVX > 0);
6571   match(Set dst (XorV src1 src2));
6572   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors" %}
6573   ins_encode %{
6574     int vlen_enc = vector_length_encoding(this);
6575     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
6576   %}
6577   ins_pipe( pipe_slow );
6578 %}
6579 
6580 instruct vxor_mem(vec dst, vec src, memory mem) %{
6581   predicate((UseAVX > 0) &&
6582             (Matcher::vector_length_in_bytes(n->in(1)) > 8));
6583   match(Set dst (XorV src (LoadVector mem)));
6584   format %{ "vpxor   $dst,$src,$mem\t! xor vectors" %}
6585   ins_encode %{
6586     int vlen_enc = vector_length_encoding(this);
6587     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vlen_enc);
6588   %}
6589   ins_pipe( pipe_slow );
6590 %}
6591 
6592 // --------------------------------- VectorCast --------------------------------------
6593 
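     // Widening casts from byte sign-extend with vpmovsxb*; casts to float/double
     // first sign-extend to int and then convert with vcvtdq2ps/vcvtdq2pd.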
6594 instruct vcastBtoX(vec dst, vec src) %{
6595   match(Set dst (VectorCastB2X src));
6596   format %{ "vector_cast_b2x $dst,$src\t!" %}
6597   ins_encode %{
6598     assert(UseAVX > 0, "required");
6599 
6600     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6601     int vlen_enc = vector_length_encoding(this);
6602     switch (to_elem_bt) {
6603       case T_SHORT:
6604         __ vpmovsxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6605         break;
6606       case T_INT:
6607         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6608         break;
6609       case T_FLOAT:
6610         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6611         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6612         break;
6613       case T_LONG:
6614         __ vpmovsxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6615         break;
6616       case T_DOUBLE:
6617         __ vpmovsxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6618         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6619         break;
6620 
6621       default: assert(false, "%s", type2name(to_elem_bt));
6622     }
6623   %}
6624   ins_pipe( pipe_slow );
6625 %}
6626 
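     // Narrowing short->byte: the AND with vector_short_to_byte_mask clears the
     // high byte of every short, so the unsigned-saturating vpackuswb acts as a
     // plain truncation rather than saturating.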
6627 instruct castStoX(vec dst, vec src, rRegP scratch) %{
6628   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6629             Matcher::vector_length(n->in(1)) <= 8 && // src
6630             Matcher::vector_element_basic_type(n) == T_BYTE);
6631   effect(TEMP scratch);
6632   match(Set dst (VectorCastS2X src));
6633   format %{ "vector_cast_s2x $dst,$src\t! using $scratch as TEMP" %}
6634   ins_encode %{
6635     assert(UseAVX > 0, "required");
6636 
6637     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), 0, $scratch$$Register);
6638     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0);
6639   %}
6640   ins_pipe( pipe_slow );
6641 %}
6642 
6643 instruct vcastStoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6644   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vlbw()) &&
6645             Matcher::vector_length(n->in(1)) == 16 && // src
6646             Matcher::vector_element_basic_type(n) == T_BYTE);
6647   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6648   match(Set dst (VectorCastS2X src));
6649   format %{ "vector_cast_s2x $dst,$src\t! using $vtmp, $scratch as TEMP" %}
6650   ins_encode %{
6651     assert(UseAVX > 0, "required");
6652 
6653     int vlen_enc = vector_length_encoding(Matcher::vector_length_in_bytes(this, $src));
6654     __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
6655     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
6656     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0);
6657   %}
6658   ins_pipe( pipe_slow );
6659 %}
6660 
6661 instruct vcastStoX_evex(vec dst, vec src) %{
6662   predicate((UseAVX > 2 && VM_Version::supports_avx512vlbw()) ||
6663             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
6664   match(Set dst (VectorCastS2X src));
6665   format %{ "vector_cast_s2x $dst,$src\t!" %}
6666   ins_encode %{
6667     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6668     int src_vlen_enc = vector_length_encoding(this, $src);
6669     int vlen_enc = vector_length_encoding(this);
6670     switch (to_elem_bt) {
6671       case T_BYTE:
6672         if (!VM_Version::supports_avx512vl()) {
6673           vlen_enc = Assembler::AVX_512bit;
6674         }
6675         __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6676         break;
6677       case T_INT:
6678         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6679         break;
6680       case T_FLOAT:
6681         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6682         __ vcvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6683         break;
6684       case T_LONG:
6685         __ vpmovsxwq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6686         break;
6687       case T_DOUBLE:
6688         __ vpmovsxwd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6689         __ vcvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6690         break;
6691       default:
6692         ShouldNotReachHere();
6693     }
6694   %}
6695   ins_pipe( pipe_slow );
6696 %}
6697 
6698 instruct castItoX(vec dst, vec src, rRegP scratch) %{
6699   predicate(UseAVX <= 2 &&
6700             (Matcher::vector_length_in_bytes(n->in(1)) <= 16) &&
6701             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
6702   match(Set dst (VectorCastI2X src));
6703   format %{ "vector_cast_i2x $dst,$src\t! using $scratch as TEMP" %}
6704   effect(TEMP scratch);
6705   ins_encode %{
6706     assert(UseAVX > 0, "required");
6707 
6708     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6709     int vlen_enc = vector_length_encoding(this, $src);
6710 
6711     if (to_elem_bt == T_BYTE) {
6712       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
6713       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6714       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6715     } else {
6716       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
6717       __ vpand($dst$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6718       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6719     }
6720   %}
6721   ins_pipe( pipe_slow );
6722 %}
6723 
6724 instruct vcastItoX(vec dst, vec src, vec vtmp, rRegP scratch) %{
6725   predicate(UseAVX <= 2 &&
6726             (Matcher::vector_length_in_bytes(n->in(1)) == 32) &&
6727             (Matcher::vector_length_in_bytes(n) < Matcher::vector_length_in_bytes(n->in(1)))); // dst < src
6728   match(Set dst (VectorCastI2X src));
6729   format %{ "vector_cast_i2x $dst,$src\t! using $vtmp and $scratch as TEMP" %}
6730   effect(TEMP dst, TEMP vtmp, TEMP scratch);
6731   ins_encode %{
6732     assert(UseAVX > 0, "required");
6733 
6734     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6735     int vlen_enc = vector_length_encoding(this, $src);
6736 
6737     if (to_elem_bt == T_BYTE) {
6738       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_byte_mask()), vlen_enc, $scratch$$Register);
6739       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
6740       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6741       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6742     } else {
6743       assert(to_elem_bt == T_SHORT, "%s", type2name(to_elem_bt));
6744       __ vpand($vtmp$$XMMRegister, $src$$XMMRegister, ExternalAddress(vector_int_to_short_mask()), vlen_enc, $scratch$$Register);
6745       __ vextracti128($dst$$XMMRegister, $vtmp$$XMMRegister, 0x1);
6746       __ vpackusdw($dst$$XMMRegister, $vtmp$$XMMRegister, $dst$$XMMRegister, vlen_enc);
6747     }
6748   %}
6749   ins_pipe( pipe_slow );
6750 %}
6751 
6752 instruct vcastItoX_evex(vec dst, vec src) %{
6753   predicate(UseAVX > 2 ||
6754             (Matcher::vector_length_in_bytes(n) >= Matcher::vector_length_in_bytes(n->in(1)))); // dst >= src
6755   match(Set dst (VectorCastI2X src));
6756   format %{ "vector_cast_i2x $dst,$src\t!" %}
6757   ins_encode %{
6758     assert(UseAVX > 0, "required");
6759 
6760     BasicType dst_elem_bt = Matcher::vector_element_basic_type(this);
6761     int src_vlen_enc = vector_length_encoding(this, $src);
6762     int dst_vlen_enc = vector_length_encoding(this);
6763     switch (dst_elem_bt) {
6764       case T_BYTE:
6765         if (!VM_Version::supports_avx512vl()) {
6766           src_vlen_enc = Assembler::AVX_512bit;
6767         }
6768         __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6769         break;
6770       case T_SHORT:
6771         if (!VM_Version::supports_avx512vl()) {
6772           src_vlen_enc = Assembler::AVX_512bit;
6773         }
6774         __ evpmovdw($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
6775         break;
6776       case T_FLOAT:
6777         __ vcvtdq2ps($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6778         break;
6779       case T_LONG:
6780         __ vpmovsxdq($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6781         break;
6782       case T_DOUBLE:
6783         __ vcvtdq2pd($dst$$XMMRegister, $src$$XMMRegister, dst_vlen_enc);
6784         break;
6785       default:
6786         ShouldNotReachHere();
6787     }
6788   %}
6789   ins_pipe( pipe_slow );
6790 %}
6791 
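     // Narrowing long->byte/short without AVX-512: shuffles gather the low dword
     // of each long into the low 128 bits, then the usual mask-and-pack sequence
     // narrows those ints to shorts (and further to bytes if needed).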
6792 instruct vcastLtoBS(vec dst, vec src, rRegP scratch) %{
6793   predicate((Matcher::vector_element_basic_type(n) == T_BYTE || Matcher::vector_element_basic_type(n) == T_SHORT) &&
6794             UseAVX <= 2);
6795   match(Set dst (VectorCastL2X src));
6796   effect(TEMP scratch);
6797   format %{ "vector_cast_l2x  $dst,$src\t! using $scratch as TEMP" %}
6798   ins_encode %{
6799     assert(UseAVX > 0, "required");
6800 
6801     int vlen = Matcher::vector_length_in_bytes(this, $src);
6802     BasicType to_elem_bt  = Matcher::vector_element_basic_type(this);
6803     AddressLiteral mask_addr = (to_elem_bt == T_BYTE) ? ExternalAddress(vector_int_to_byte_mask())
6804                                                       : ExternalAddress(vector_int_to_short_mask());
6805     if (vlen <= 16) {
6806       __ vpshufd($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_128bit);
6807       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
6808       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6809     } else {
6810       assert(vlen <= 32, "required");
6811       __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, Assembler::AVX_256bit);
6812       __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, Assembler::AVX_256bit);
6813       __ vpand($dst$$XMMRegister, $dst$$XMMRegister, mask_addr, Assembler::AVX_128bit, $scratch$$Register);
6814       __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6815     }
6816     if (to_elem_bt == T_BYTE) {
6817       __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, Assembler::AVX_128bit);
6818     }
6819   %}
6820   ins_pipe( pipe_slow );
6821 %}
6822 
6823 instruct vcastLtoX_evex(vec dst, vec src) %{
6824   predicate(UseAVX > 2 ||
6825             (Matcher::vector_element_basic_type(n) == T_INT ||
6826              Matcher::vector_element_basic_type(n) == T_FLOAT ||
6827              Matcher::vector_element_basic_type(n) == T_DOUBLE));
6828   match(Set dst (VectorCastL2X src));
6829   format %{ "vector_cast_l2x  $dst,$src\t!" %}
6830   ins_encode %{
6831     BasicType to_elem_bt = Matcher::vector_element_basic_type(this);
6832     int vlen = Matcher::vector_length_in_bytes(this, $src);
6833     int vlen_enc = vector_length_encoding(this, $src);
6834     switch (to_elem_bt) {
6835       case T_BYTE:
6836         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
6837           vlen_enc = Assembler::AVX_512bit;
6838         }
6839         __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6840         break;
6841       case T_SHORT:
6842         if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
6843           vlen_enc = Assembler::AVX_512bit;
6844         }
6845         __ evpmovqw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6846         break;
6847       case T_INT:
6848         if (vlen == 8) {
6849           if ($dst$$XMMRegister != $src$$XMMRegister) {
6850             __ movflt($dst$$XMMRegister, $src$$XMMRegister);
6851           }
6852         } else if (vlen == 16) {
6853           __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 8);
6854         } else if (vlen == 32) {
6855           if (UseAVX > 2) {
6856             if (!VM_Version::supports_avx512vl()) {
6857               vlen_enc = Assembler::AVX_512bit;
6858             }
6859             __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6860           } else {
6861             __ vpermilps($dst$$XMMRegister, $src$$XMMRegister, 8, vlen_enc);
6862             __ vpermpd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
6863           }
6864         } else { // vlen == 64
6865           __ evpmovqd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6866         }
6867         break;
6868       case T_FLOAT:
6869         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
6870         __ evcvtqq2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6871         break;
6872       case T_DOUBLE:
6873         assert(UseAVX > 2 && VM_Version::supports_avx512dq(), "required");
6874         __ evcvtqq2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6875         break;
6876 
6877       default: assert(false, "%s", type2name(to_elem_bt));
6878     }
6879   %}
6880   ins_pipe( pipe_slow );
6881 %}
6882 
6883 instruct vcastFtoD_reg(vec dst, vec src) %{
6884   predicate(Matcher::vector_element_basic_type(n) == T_DOUBLE);
6885   match(Set dst (VectorCastF2X src));
6886   format %{ "vector_cast_f2x  $dst,$src\t!" %}
6887   ins_encode %{
6888     int vlen_enc = vector_length_encoding(this);
6889     __ vcvtps2pd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6890   %}
6891   ins_pipe( pipe_slow );
6892 %}
6893 
6894 instruct vcastDtoF_reg(vec dst, vec src) %{
6895   predicate(Matcher::vector_element_basic_type(n) == T_FLOAT);
6896   match(Set dst (VectorCastD2X src));
6897   format %{ "vector_cast_d2x  $dst,$src\t!" %}
6898   ins_encode %{
6899     int vlen_enc = vector_length_encoding(this, $src);
6900     __ vcvtpd2ps($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
6901   %}
6902   ins_pipe( pipe_slow );
6903 %}
6904 
6905 // --------------------------------- VectorMaskCmp --------------------------------------
6906 
6907 instruct vcmpFD(legVec dst, legVec src1, legVec src2, immI8 cond) %{
6908   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
6909             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
6910             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6911   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6912   format %{ "vector_compare $dst,$src1,$src2,$cond\t!" %}
6913   ins_encode %{
6914     int vlen_enc = vector_length_encoding(this, $src1);
6915     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6916     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
6917       __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6918     } else {
6919       __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6920     }
6921   %}
6922   ins_pipe( pipe_slow );
6923 %}
6924 
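     // EVEX compares write their predicate into a k register; the vector-boolean
     // result is then materialized by a masked load of vector_all_bits_set, which
     // leaves -1 in lanes where the compare was true and 0 elsewhere (merge=false
     // selects zero-masking).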
6925 instruct evcmpFD(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
6926   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 && // src1
6927             is_floating_point_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1 T_FLOAT, T_DOUBLE
6928   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6929   effect(TEMP scratch, TEMP ktmp);
6930   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6931   ins_encode %{
6932     int vlen_enc = Assembler::AVX_512bit;
6933     Assembler::ComparisonPredicateFP cmp = booltest_pred_to_comparison_pred_fp($cond$$constant);
6934     KRegister mask = k0; // The comparison itself is not being masked.
6935     if (Matcher::vector_element_basic_type(this, $src1) == T_FLOAT) {
6936       __ evcmpps($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6937       __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
6938     } else {
6939       __ evcmppd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen_enc);
6940       __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), false, vlen_enc, $scratch$$Register);
6941     }
6942   %}
6943   ins_pipe( pipe_slow );
6944 %}
6945 
6946 instruct vcmp(legVec dst, legVec src1, legVec src2, immI8 cond, rRegP scratch) %{
6947   predicate((UseAVX <= 2 || !VM_Version::supports_avx512vl()) &&
6948             !is_unsigned_booltest_pred(n->in(2)->get_int()) &&
6949             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  4 && // src1
6950             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 32 && // src1
6951             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
6952   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6953   effect(TEMP scratch);
6954   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6955   ins_encode %{
6956     int vlen_enc = vector_length_encoding(this, $src1);
6957     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6958     Assembler::Width ww = widthForType(Matcher::vector_element_basic_type(this, $src1));
6959     __ vpcmpCCW($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, ww, vlen_enc, $scratch$$Register);
6960   %}
6961   ins_pipe( pipe_slow );
6962 %}
6963 
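     // There is no unsigned integer vector compare below AVX-512; the vpcmpu*
     // helpers emulate it with the vector TEMPs (e.g. by biasing both operands so
     // a signed compare yields the unsigned result).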
6964 instruct vcmpu(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
6965   predicate((UseAVX == 2 || !VM_Version::supports_avx512vl()) &&
6966             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
6967             Matcher::vector_length_in_bytes(n->in(1)->in(1)) >=  8 && // src1
6968             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <= 16 && // src1
6969             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
6970   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6971   effect(TEMP vtmp1, TEMP vtmp2, TEMP scratch);
6972   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6973   ins_encode %{
6974     int vlen = Matcher::vector_length_in_bytes(this, $src1);
6975     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6976     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
6977     __ vpcmpu(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
6978               $vtmp2$$XMMRegister, $scratch$$Register);
6979   %}
6980   ins_pipe( pipe_slow );
6981 %}
6982 
6983 instruct vcmpu32(legVec dst, legVec src1, legVec src2, immI8 cond, legVec vtmp1, legVec vtmp2, legVec vtmp3, rRegP scratch) %{
6984   predicate((UseAVX == 2 || !VM_Version::supports_avx512vl()) &&
6985             is_unsigned_booltest_pred(n->in(2)->get_int()) &&
6986             Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 32 && // src1
6987             is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
6988   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
6989   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP scratch);
6990   format %{ "vector_compareu $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
6991   ins_encode %{
6992     int vlen = Matcher::vector_length_in_bytes(this, $src1);
6993     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
6994     BasicType bt = Matcher::vector_element_basic_type(this, $src1);
6995     __ vpcmpu32(bt, $dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cmp, vlen, $vtmp1$$XMMRegister,
6996                 $vtmp2$$XMMRegister, $vtmp3$$XMMRegister, $scratch$$Register);
6997   %}
6998   ins_pipe( pipe_slow );
6999 %}
7000 
7001 instruct evcmp(vec dst, vec src1, vec src2, immI8 cond, rRegP scratch, kReg ktmp) %{
7002   predicate(UseAVX > 2 &&
7003             (VM_Version::supports_avx512vl() ||
7004              Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64) && // src1
7005              is_integral_type(Matcher::vector_element_basic_type(n->in(1)->in(1)))); // src1
7006   match(Set dst (VectorMaskCmp (Binary src1 src2) cond));
7007   effect(TEMP scratch, TEMP ktmp);
7008   format %{ "vector_compare $dst,$src1,$src2,$cond\t! using $scratch as TEMP" %}
7009   ins_encode %{
7010     assert(UseAVX > 2, "required");
7011 
7012     int vlen_enc = vector_length_encoding(this, $src1);
7013     Assembler::ComparisonPredicate cmp = booltest_pred_to_comparison_pred($cond$$constant);
7014     bool is_unsigned = is_unsigned_booltest_pred($cond$$constant);
7015     KRegister mask = k0; // The comparison itself is not being masked.
7016     bool merge = false;
7017     BasicType src1_elem_bt = Matcher::vector_element_basic_type(this, $src1);
7018 
7019     switch (src1_elem_bt) {
7020       case T_BYTE: {
7021         __ evpcmpb($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7022         __ evmovdqub($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7023         break;
7024       }
7025       case T_SHORT: {
7026         __ evpcmpw($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7027         __ evmovdquw($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7028         break;
7029       }
7030       case T_INT: {
7031         __ evpcmpd($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7032         __ evmovdqul($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7033         break;
7034       }
7035       case T_LONG: {
7036         __ evpcmpq($ktmp$$KRegister, mask, $src1$$XMMRegister, $src2$$XMMRegister, cmp, !is_unsigned, vlen_enc);
7037         __ evmovdquq($dst$$XMMRegister, $ktmp$$KRegister, ExternalAddress(vector_all_bits_set()), merge, vlen_enc, $scratch$$Register);
7038         break;
7039       }
7040       default: assert(false, "%s", type2name(src1_elem_bt));
7041     }
7042   %}
7043   ins_pipe( pipe_slow );
7044 %}
7045 
7046 // Extract
7047 
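     // Element extraction: for vectors wider than 128 bits the element's 128-bit
     // lane is first isolated into a TEMP (get_lane) and the element is then read
     // out of that lane (get_elem).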
7048 instruct extractI(rRegI dst, legVec src, immU8 idx) %{
7049   predicate(Matcher::vector_length_in_bytes(n->in(1)) <= 16); // src
7050   match(Set dst (ExtractI src idx));
7051   match(Set dst (ExtractS src idx));
7052 #ifdef _LP64
7053   match(Set dst (ExtractB src idx));
7054 #endif
7055   format %{ "extractI $dst,$src,$idx\t!" %}
7056   ins_encode %{
7057     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7058 
7059     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7060     __ get_elem(elem_bt, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7061   %}
7062   ins_pipe( pipe_slow );
7063 %}
7064 
7065 instruct vextractI(rRegI dst, legVec src, immI idx, legVec vtmp) %{
7066   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 32 || // src
7067             Matcher::vector_length_in_bytes(n->in(1)) == 64);  // src
7068   match(Set dst (ExtractI src idx));
7069   match(Set dst (ExtractS src idx));
7070 #ifdef _LP64
7071   match(Set dst (ExtractB src idx));
7072 #endif
7073   effect(TEMP vtmp);
7074   format %{ "vextractI $dst,$src,$idx\t! using $vtmp as TEMP" %}
7075   ins_encode %{
7076     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7077 
7078     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src);
7079     XMMRegister lane_xmm = __ get_lane(elem_bt, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7080     __ get_elem(elem_bt, $dst$$Register, lane_xmm, $idx$$constant);
7081   %}
7082   ins_pipe( pipe_slow );
7083 %}
7084 
7085 #ifdef _LP64
7086 instruct extractL(rRegL dst, legVec src, immU8 idx) %{
7087   predicate(Matcher::vector_length(n->in(1)) <= 2); // src
7088   match(Set dst (ExtractL src idx));
7089   format %{ "extractL $dst,$src,$idx\t!" %}
7090   ins_encode %{
7091     assert(UseSSE >= 4, "required");
7092     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7093 
7094     __ get_elem(T_LONG, $dst$$Register, $src$$XMMRegister, $idx$$constant);
7095   %}
7096   ins_pipe( pipe_slow );
7097 %}
7098 
7099 instruct vextractL(rRegL dst, legVec src, immU8 idx, legVec vtmp) %{
7100   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7101             Matcher::vector_length(n->in(1)) == 8);  // src
7102   match(Set dst (ExtractL src idx));
7103   effect(TEMP vtmp);
7104   format %{ "vextractL $dst,$src,$idx\t! using $vtmp as TEMP" %}
7105   ins_encode %{
7106     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7107 
7108     XMMRegister lane_reg = __ get_lane(T_LONG, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7109     __ get_elem(T_LONG, $dst$$Register, lane_reg, $idx$$constant);
7110   %}
7111   ins_pipe( pipe_slow );
7112 %}
7113 #endif
7114 
7115 instruct extractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7116   predicate(Matcher::vector_length(n->in(1)) <= 4);
7117   match(Set dst (ExtractF src idx));
7118   effect(TEMP dst, TEMP tmp, TEMP vtmp);
7119   format %{ "extractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7120   ins_encode %{
7121     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7122 
7123     __ get_elem(T_FLOAT, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant, $tmp$$Register, $vtmp$$XMMRegister);
7124   %}
7125   ins_pipe( pipe_slow );
7126 %}
7127 
7128 instruct vextractF(legRegF dst, legVec src, immU8 idx, rRegI tmp, legVec vtmp) %{
7129   predicate(Matcher::vector_length(n->in(1)/*src*/) == 8 ||
7130             Matcher::vector_length(n->in(1)/*src*/) == 16);
7131   match(Set dst (ExtractF src idx));
7132   effect(TEMP tmp, TEMP vtmp);
7133   format %{ "vextractF $dst,$src,$idx\t! using $tmp, $vtmp as TEMP" %}
7134   ins_encode %{
7135     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7136 
7137     XMMRegister lane_reg = __ get_lane(T_FLOAT, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7138     __ get_elem(T_FLOAT, $dst$$XMMRegister, lane_reg, $idx$$constant, $tmp$$Register);
7139   %}
7140   ins_pipe( pipe_slow );
7141 %}
7142 
7143 instruct extractD(legRegD dst, legVec src, immU8 idx) %{
7144   predicate(Matcher::vector_length(n->in(1)) == 2); // src
7145   match(Set dst (ExtractD src idx));
7146   format %{ "extractD $dst,$src,$idx\t!" %}
7147   ins_encode %{
7148     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7149 
7150     __ get_elem(T_DOUBLE, $dst$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7151   %}
7152   ins_pipe( pipe_slow );
7153 %}
7154 
7155 instruct vextractD(legRegD dst, legVec src, immU8 idx, legVec vtmp) %{
7156   predicate(Matcher::vector_length(n->in(1)) == 4 || // src
7157             Matcher::vector_length(n->in(1)) == 8);  // src
7158   match(Set dst (ExtractD src idx));
7159   effect(TEMP vtmp);
7160   format %{ "vextractD $dst,$src,$idx\t! using $vtmp as TEMP" %}
7161   ins_encode %{
7162     assert($idx$$constant < (int)Matcher::vector_length(this, $src), "out of bounds");
7163 
7164     XMMRegister lane_reg = __ get_lane(T_DOUBLE, $vtmp$$XMMRegister, $src$$XMMRegister, $idx$$constant);
7165     __ get_elem(T_DOUBLE, $dst$$XMMRegister, lane_reg, $idx$$constant);
7166   %}
7167   ins_pipe( pipe_slow );
7168 %}
7169 
7170 // --------------------------------- Vector Blend --------------------------------------
7171 
7172 instruct blendvp(vec dst, vec src, vec mask, rxmm0 tmp) %{
7173   predicate(UseAVX == 0);
7174   match(Set dst (VectorBlend (Binary dst src) mask));
7175   format %{ "vector_blend  $dst,$src,$mask\t! using $tmp as TEMP" %}
7176   effect(TEMP tmp);
7177   ins_encode %{
7178     assert(UseSSE >= 4, "required");
7179 
7180     if ($mask$$XMMRegister != $tmp$$XMMRegister) {
7181       __ movdqu($tmp$$XMMRegister, $mask$$XMMRegister);
7182     }
7183     __ pblendvb($dst$$XMMRegister, $src$$XMMRegister); // uses xmm0 as mask
7184   %}
7185   ins_pipe( pipe_slow );
7186 %}
7187 
7188 instruct vblendvpI(legVec dst, legVec src1, legVec src2, legVec mask) %{
7189   predicate(UseAVX > 0 &&
7190             Matcher::vector_length_in_bytes(n) <= 32 &&
7191             is_integral_type(Matcher::vector_element_basic_type(n)));
7192   match(Set dst (VectorBlend (Binary src1 src2) mask));
7193   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7194   ins_encode %{
7195     int vlen_enc = vector_length_encoding(this);
7196     __ vpblendvb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7197   %}
7198   ins_pipe( pipe_slow );
7199 %}
7200 
7201 instruct vblendvpFD(legVec dst, legVec src1, legVec src2, legVec mask) %{
7202   predicate(UseAVX > 0 &&
7203             Matcher::vector_length_in_bytes(n) <= 32 &&
7204             !is_integral_type(Matcher::vector_element_basic_type(n)));
7205   match(Set dst (VectorBlend (Binary src1 src2) mask));
7206   format %{ "vector_blend  $dst,$src1,$src2,$mask\t!" %}
7207   ins_encode %{
7208     int vlen_enc = vector_length_encoding(this);
7209     __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $mask$$XMMRegister, vlen_enc);
7210   %}
7211   ins_pipe( pipe_slow );
7212 %}
7213 
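     // 512-bit blend: vpblendvb has no EVEX form, so the boolean mask vector is
     // first turned into a k register by comparing it against the all-ones
     // constant, and evpblend then merges src1/src2 under that k mask.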
7214 instruct evblendvp64(vec dst, vec src1, vec src2, vec mask, rRegP scratch, kReg ktmp) %{
7215   predicate(Matcher::vector_length_in_bytes(n) == 64);
7216   match(Set dst (VectorBlend (Binary src1 src2) mask));
7217   format %{ "vector_blend  $dst,$src1,$src2,$mask\t! using $scratch and $ktmp as TEMP" %}
7218   effect(TEMP scratch, TEMP ktmp);
7219   ins_encode %{
7220     int vlen_enc = Assembler::AVX_512bit;
7221     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7222     __ evpcmp(elem_bt, $ktmp$$KRegister, k0, $mask$$XMMRegister, ExternalAddress(vector_all_bits_set()), Assembler::eq, vlen_enc, $scratch$$Register);
7223     __ evpblend(elem_bt, $dst$$XMMRegister, $ktmp$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, true, vlen_enc);
7224   %}
7225   ins_pipe( pipe_slow );
7226 %}
7227 
7228 // --------------------------------- ABS --------------------------------------
7229 // a = |a|
7230 instruct vabsB_reg(vec dst, vec src) %{
7231   match(Set dst (AbsVB  src));
7232   format %{ "vabsb $dst,$src\t# $dst = |$src| abs packedB" %}
7233   ins_encode %{
7234     uint vlen = Matcher::vector_length(this);
7235     if (vlen <= 16) {
7236       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7237     } else {
7238       int vlen_enc = vector_length_encoding(this);
7239       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7240     }
7241   %}
7242   ins_pipe( pipe_slow );
7243 %}
7244 
7245 instruct vabsS_reg(vec dst, vec src) %{
7246   match(Set dst (AbsVS  src));
7247   format %{ "vabsw $dst,$src\t# $dst = |$src| abs packedS" %}
7248   ins_encode %{
7249     uint vlen = Matcher::vector_length(this);
7250     if (vlen <= 8) {
7251       __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7252     } else {
7253       int vlen_enc = vector_length_encoding(this);
7254       __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7255     }
7256   %}
7257   ins_pipe( pipe_slow );
7258 %}
7259 
7260 instruct vabsI_reg(vec dst, vec src) %{
7261   match(Set dst (AbsVI  src));
7262   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packedI" %}
7263   ins_encode %{
7264     uint vlen = Matcher::vector_length(this);
7265     if (vlen <= 4) {
7266       __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7267     } else {
7268       int vlen_enc = vector_length_encoding(this);
7269       __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7270     }
7271   %}
7272   ins_pipe( pipe_slow );
7273 %}
7274 
7275 instruct vabsL_reg(vec dst, vec src) %{
7276   match(Set dst (AbsVL  src));
7277   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packedL" %}
7278   ins_encode %{
7279     assert(UseAVX > 2, "required");
7280     int vlen_enc = vector_length_encoding(this);
7281     if (!VM_Version::supports_avx512vl()) {
7282       vlen_enc = Assembler::AVX_512bit;
7283     }
7284     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7285   %}
7286   ins_pipe( pipe_slow );
7287 %}
7288 
7289 // --------------------------------- ABSNEG --------------------------------------
7290 
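     // Float/double abs and neg are done by AND-ing with a sign-clear mask or
     // XOR-ing with a sign-flip mask; the mask constant is addressed through
     // $scratch and selected inside vabsnegf/vabsnegd from the ideal opcode.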
7291 instruct vabsnegF(vec dst, vec src, rRegI scratch) %{
7292   predicate(Matcher::vector_length(n) != 4); // handled by 1-operand instruction vabsneg4F
7293   match(Set dst (AbsVF src));
7294   match(Set dst (NegVF src));
7295   effect(TEMP scratch);
7296   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packedF" %}
7297   ins_cost(150);
7298   ins_encode %{
7299     int opcode = this->ideal_Opcode();
7300     int vlen = Matcher::vector_length(this);
7301     if (vlen == 2) {
7302       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7303     } else {
7304       assert(vlen == 8 || vlen == 16, "required");
7305       int vlen_enc = vector_length_encoding(this);
7306       __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7307     }
7308   %}
7309   ins_pipe( pipe_slow );
7310 %}
7311 
7312 instruct vabsneg4F(vec dst, rRegI scratch) %{
7313   predicate(Matcher::vector_length(n) == 4);
7314   match(Set dst (AbsVF dst));
7315   match(Set dst (NegVF dst));
7316   effect(TEMP scratch);
7317   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
7318   ins_cost(150);
7319   ins_encode %{
7320     int opcode = this->ideal_Opcode();
7321     __ vabsnegf(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $scratch$$Register);
7322   %}
7323   ins_pipe( pipe_slow );
7324 %}
7325 
7326 instruct vabsnegD(vec dst, vec src, rRegI scratch) %{
7327   match(Set dst (AbsVD  src));
7328   match(Set dst (NegVD  src));
7329   effect(TEMP scratch);
7330   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packedD" %}
7331   ins_encode %{
7332     int opcode = this->ideal_Opcode();
7333     uint vlen = Matcher::vector_length(this);
7334     if (vlen == 2) {
7335       assert(UseSSE >= 2, "required");
7336       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $scratch$$Register);
7337     } else {
7338       int vlen_enc = vector_length_encoding(this);
7339       __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc, $scratch$$Register);
7340     }
7341   %}
7342   ins_pipe( pipe_slow );
7343 %}
7344 
7345 //------------------------------------- VectorTest --------------------------------------------
7346 
7347 #ifdef _LP64
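     // PTEST/VPTEST sets CF when (src2 & ~src1) == 0 and ZF when (src1 & src2) == 0,
     // so the all-true variants read the carry flag (setb carrySet) and the
     // any-true variants read the inverted zero flag (setb notZero).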
7348 instruct vptest_alltrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp1, legVec vtmp2, rFlagsReg cr) %{
7349   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7350             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7351             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7352   match(Set dst (VectorTest src1 src2 ));
7353   effect(TEMP vtmp1, TEMP vtmp2, KILL cr);
7354   format %{ "vector_test $dst,$src1, $src2\t! using $vtmp1, $vtmp2 and $cr as TEMP" %}
7355   ins_encode %{
7356     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7357     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
7358     __ setb(Assembler::carrySet, $dst$$Register);
7359     __ movzbl($dst$$Register, $dst$$Register);
7360   %}
7361   ins_pipe( pipe_slow );
7362 %}
7363 
7364 instruct vptest_alltrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7365   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7366             Matcher::vector_length_in_bytes(n->in(1)) <  64 &&
7367             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7368   match(Set dst (VectorTest src1 src2 ));
7369   effect(KILL cr);
7370   format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %}
7371   ins_encode %{
7372     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7373     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7374     __ setb(Assembler::carrySet, $dst$$Register);
7375     __ movzbl($dst$$Register, $dst$$Register);
7376   %}
7377   ins_pipe( pipe_slow );
7378 %}
7379 
7380 instruct vptest_alltrue_evex(rRegI dst, legVec src1, legVec src2, kReg ktmp, rFlagsReg cr) %{
7381   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 64 &&
7382             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::overflow);
7383   match(Set dst (VectorTest src1 src2 ));
7384   effect(KILL cr, TEMP ktmp);
7385   format %{ "vector_test $dst,$src1, $src2\t! using $cr as TEMP" %}
7386   ins_encode %{
7387     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7388     __ vectortest(BoolTest::overflow, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister);
7389     __ setb(Assembler::carrySet, $dst$$Register);
7390     __ movzbl($dst$$Register, $dst$$Register);
7391   %}
7392   ins_pipe( pipe_slow );
7393 %}
7394 
7395 instruct vptest_anytrue_lt16(rRegI dst, legVec src1, legVec src2, legVec vtmp, rFlagsReg cr) %{
7396   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 4 &&
7397             Matcher::vector_length_in_bytes(n->in(1)) < 16 &&
7398             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7399   match(Set dst (VectorTest src1 src2 ));
7400   effect(TEMP vtmp, KILL cr);
7401   format %{ "vector_test_any_true $dst,$src1,$src2\t! using $vtmp, $cr as TEMP" %}
7402   ins_encode %{
7403     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7404     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7405     __ setb(Assembler::notZero, $dst$$Register);
7406     __ movzbl($dst$$Register, $dst$$Register);
7407   %}
7408   ins_pipe( pipe_slow );
7409 %}
7410 
7411 instruct vptest_anytrue(rRegI dst, legVec src1, legVec src2, rFlagsReg cr) %{
7412   predicate(Matcher::vector_length_in_bytes(n->in(1)) >= 16 &&
7413             Matcher::vector_length_in_bytes(n->in(1)) < 64  &&
7414             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7415   match(Set dst (VectorTest src1 src2 ));
7416   effect(KILL cr);
7417   format %{ "vector_test_any_true $dst,$src1,$src2\t! using $cr as TEMP" %}
7418   ins_encode %{
7419     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7420     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7421     __ setb(Assembler::notZero, $dst$$Register);
7422     __ movzbl($dst$$Register, $dst$$Register);
7423   %}
7424   ins_pipe( pipe_slow );
7425 %}
7426 
7427 instruct vptest_anytrue_evex(rRegI dst, legVec src1, legVec src2, kReg ktmp, rFlagsReg cr) %{
7428   predicate(Matcher::vector_length_in_bytes(n->in(1)) == 64 &&
7429             static_cast<const VectorTestNode*>(n)->get_predicate() == BoolTest::ne);
7430   match(Set dst (VectorTest src1 src2 ));
7431   effect(KILL cr, TEMP ktmp);
7432   format %{ "vector_test_any_true $dst,$src1,$src2\t! using $cr as TEMP" %}
7433   ins_encode %{
7434     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7435     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister);
7436     __ setb(Assembler::notZero, $dst$$Register);
7437     __ movzbl($dst$$Register, $dst$$Register);
7438   %}
7439   ins_pipe( pipe_slow );
7440 %}
7441 
7442 instruct cmpvptest_anytrue_lt16(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, legVec vtmp) %{
7443   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 4 &&
7444             Matcher::vector_length_in_bytes(n->in(1)->in(1)) < 16 &&
7445             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7446   match(Set cr (CmpI (VectorTest src1 src2) zero));
7447   effect(TEMP vtmp);
7448   format %{ "cmp_vector_test_any_true $src1,$src2\t! using $vtmp as TEMP" %}
7449   ins_encode %{
7450     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7451     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
7452   %}
7453   ins_pipe( pipe_slow );
7454 %}
7455 
7456 instruct cmpvptest_anytrue(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero) %{
7457   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) >= 16 &&
7458             Matcher::vector_length_in_bytes(n->in(1)->in(1)) <  64 &&
7459             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7460   match(Set cr (CmpI (VectorTest src1 src2) zero));
7461   format %{ "cmp_vector_test_any_true $src1,$src2\t!" %}
7462   ins_encode %{
7463     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7464     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, knoreg);
7465   %}
7466   ins_pipe( pipe_slow );
7467 %}
7468 
7469 instruct cmpvptest_anytrue_evex(rFlagsReg cr, legVec src1, legVec src2, immI_0 zero, kReg ktmp) %{
7470   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) == 64 &&
7471             static_cast<const VectorTestNode*>(n->in(1))->get_predicate() == BoolTest::ne);
7472   match(Set cr (CmpI (VectorTest src1 src2) zero));
7473   effect(TEMP ktmp);
7474   format %{ "cmp_vector_test_any_true $src1,$src2\t!" %}
7475   ins_encode %{
7476     int vlen = Matcher::vector_length_in_bytes(this, $src1);
7477     __ vectortest(BoolTest::ne, vlen, $src1$$XMMRegister, $src2$$XMMRegister, xnoreg, xnoreg, $ktmp$$KRegister);
7478   %}
7479   ins_pipe( pipe_slow );
7480 %}
7481 #endif
7482 
7483 //------------------------------------- LoadMask --------------------------------------------
7484 
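     // VectorLoadMask turns a vector of boolean bytes (0/1) into a lane mask:
     // each byte is extended to the element width and converted so that true
     // lanes hold all ones (-1) and false lanes hold 0.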
7485 instruct loadMask(legVec dst, legVec src) %{
7486   predicate(!VM_Version::supports_avx512vlbw());
7487   match(Set dst (VectorLoadMask src));
7488   effect(TEMP dst);
7489   format %{ "vector_loadmask_byte $dst,$src\n\t" %}
7490   ins_encode %{
7491     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7492     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7493 
7494     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, true);
7495   %}
7496   ins_pipe( pipe_slow );
7497 %}
7498 
7499 instruct loadMask_evex(vec dst, vec src) %{
7500   predicate(VM_Version::supports_avx512vlbw());
7501   match(Set dst (VectorLoadMask src));
7502   effect(TEMP dst);
7503   format %{ "vector_loadmask_byte $dst,$src\n\t" %}
7504   ins_encode %{
7505     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7506     BasicType elem_bt = Matcher::vector_element_basic_type(this);
7507 
7508     __ load_vector_mask($dst$$XMMRegister, $src$$XMMRegister, vlen_in_bytes, elem_bt, false);
7509   %}
7510   ins_pipe( pipe_slow );
7511 %}
7512 
7513 //------------------------------------- StoreMask --------------------------------------------
7514 
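     // VectorStoreMask is the inverse of VectorLoadMask: the 0/-1 lanes are
     // narrowed back to bytes and pabsb/vpabsb maps -1 to 1, yielding the
     // canonical 0/1 boolean bytes.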
7515 instruct storeMask1B(vec dst, vec src, immI_1 size) %{
7516   predicate(Matcher::vector_length(n) < 64 || VM_Version::supports_avx512vlbw());
7517   match(Set dst (VectorStoreMask src size));
7518   format %{ "vector_store_mask $dst,$src\t!" %}
7519   ins_encode %{
7520     assert(UseSSE >= 3, "required");
7521     if (Matcher::vector_length_in_bytes(this) <= 16) {
7522       __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
7523     } else {
7524       assert(UseAVX >= 2, "required");
7525       int src_vlen_enc = vector_length_encoding(this, $src);
7526       __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7527     }
7528   %}
7529   ins_pipe( pipe_slow );
7530 %}
7531 
7532 instruct storeMask2B(vec dst, vec src, immI_2 size) %{
7533   predicate(Matcher::vector_length(n) <= 8);
7534   match(Set dst (VectorStoreMask src size));
7535   format %{ "vector_store_mask $dst,$src\n\t" %}
7536   ins_encode %{
7537     assert(UseSSE >= 3, "required");
7538     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
7539     __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7540   %}
7541   ins_pipe( pipe_slow );
7542 %}
7543 
7544 instruct vstoreMask2B(vec dst, vec src, immI_2 size) %{
7545   predicate(Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
7546   match(Set dst (VectorStoreMask src size));
7547   effect(TEMP dst);
7548   format %{ "vector_store_mask $dst,$src\t!" %}
7549   ins_encode %{
7550     int vlen_enc = Assembler::AVX_128bit;
7551     __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
7552     __ vpacksswb($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7553     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7554   %}
7555   ins_pipe( pipe_slow );
7556 %}
7557 
7558 instruct vstoreMask2B_evex(vec dst, vec src, immI_2 size) %{
7559   predicate(VM_Version::supports_avx512bw());
7560   match(Set dst (VectorStoreMask src size));
7561   format %{ "vector_store_mask $dst,$src\t!" %}
7562   ins_encode %{
7563     int src_vlen_enc = vector_length_encoding(this, $src);
7564     int dst_vlen_enc = vector_length_encoding(this);
7565     __ evpmovwb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7566     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7567   %}
7568   ins_pipe( pipe_slow );
7569 %}
7570 
7571 instruct storeMask4B(vec dst, vec src, immI_4 size) %{
7572   predicate(Matcher::vector_length(n) <= 4 && UseAVX <= 2);
7573   match(Set dst (VectorStoreMask src size));
7574   format %{ "vector_store_mask $dst,$src\t!" %}
7575   ins_encode %{
7576     assert(UseSSE >= 3, "required");
7577     __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
7578     __ packssdw($dst$$XMMRegister, $dst$$XMMRegister);
7579     __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7580   %}
7581   ins_pipe( pipe_slow );
7582 %}
7583 
7584 instruct vstoreMask4B(vec dst, vec src, immI_4 size) %{
7585   predicate(Matcher::vector_length(n) == 8 && UseAVX <= 2);
7586   match(Set dst (VectorStoreMask src size));
7587   format %{ "vector_store_mask $dst,$src\t!" %}
7588   effect(TEMP dst);
7589   ins_encode %{
7590     int vlen_enc = Assembler::AVX_128bit;
7591     __ vextracti128($dst$$XMMRegister, $src$$XMMRegister, 0x1);
7592     __ vpackssdw($dst$$XMMRegister, $src$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7593     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7594     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7595   %}
7596   ins_pipe( pipe_slow );
7597 %}
7598 
7599 instruct vstoreMask4B_evex(vec dst, vec src, immI_4 size) %{
7600   predicate(UseAVX > 2);
7601   match(Set dst (VectorStoreMask src size));
7602   format %{ "vector_store_mask $dst,$src\t!" %}
7603   ins_encode %{
7604     int src_vlen_enc = vector_length_encoding(this, $src);
7605     int dst_vlen_enc = vector_length_encoding(this);
7606     if (!VM_Version::supports_avx512vl()) {
7607       src_vlen_enc = Assembler::AVX_512bit;
7608     }
7609     __ evpmovdb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7610     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7611   %}
7612   ins_pipe( pipe_slow );
7613 %}
7614 
7615 instruct storeMask8B(vec dst, vec src, immI_8 size) %{
7616   predicate(Matcher::vector_length(n) == 2 && UseAVX <= 2);
7617   match(Set dst (VectorStoreMask src size));
7618   format %{ "vector_store_mask $dst,$src\t!" %}
7619   ins_encode %{
7620     assert(UseSSE >= 3, "required");
7621     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x8);
7622     __ packssdw($dst$$XMMRegister, $dst$$XMMRegister);
7623     __ packsswb($dst$$XMMRegister, $dst$$XMMRegister);
7624     __ pabsb($dst$$XMMRegister, $dst$$XMMRegister);
7625   %}
7626   ins_pipe( pipe_slow );
7627 %}
7628 
7629 instruct storeMask8B_avx(vec dst, vec src, immI_8 size, legVec vtmp) %{
7630   predicate(Matcher::vector_length(n) == 4 && UseAVX <= 2);
7631   match(Set dst (VectorStoreMask src size));
7632   format %{ "vector_store_mask $dst,$src\t! using $vtmp as TEMP" %}
7633   effect(TEMP dst, TEMP vtmp);
7634   ins_encode %{
7635     int vlen_enc = Assembler::AVX_128bit;
7636     __ vpshufps($dst$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 0x88, Assembler::AVX_256bit);
7637     __ vextracti128($vtmp$$XMMRegister, $dst$$XMMRegister, 0x1);
7638     __ vblendps($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, 0xC, vlen_enc);
7639     __ vpackssdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7640     __ vpacksswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7641     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
7642   %}
7643   ins_pipe( pipe_slow );
7644 %}
7645 
7646 instruct vstoreMask8B_evex(vec dst, vec src, immI_8 size) %{
7647   predicate(UseAVX > 2);
7648   match(Set dst (VectorStoreMask src size));
7649   format %{ "vector_store_mask $dst,$src\t!" %}
7650   ins_encode %{
7651     int src_vlen_enc = vector_length_encoding(this, $src);
7652     int dst_vlen_enc = vector_length_encoding(this);
7653     if (!VM_Version::supports_avx512vl()) {
7654       src_vlen_enc = Assembler::AVX_512bit;
7655     }
7656     __ evpmovqb($dst$$XMMRegister, $src$$XMMRegister, src_vlen_enc);
7657     __ vpabsb($dst$$XMMRegister, $dst$$XMMRegister, dst_vlen_enc);
7658   %}
7659   ins_pipe( pipe_slow );
7660 %}
7661 
7662 instruct vmaskcast(vec dst) %{
7663   predicate((Matcher::vector_length(n) == Matcher::vector_length(n->in(1))) &&
7664             (Matcher::vector_length_in_bytes(n) == Matcher::vector_length_in_bytes(n->in(1))));
7665   match(Set dst (VectorMaskCast dst));
7666   ins_cost(0);
7667   format %{ "vector_mask_cast $dst" %}
7668   ins_encode %{
7669     // empty
7670   %}
7671   ins_pipe(empty);
7672 %}
7673 
7674 //-------------------------------- Load Iota Indices ----------------------------------
7675 
7676 instruct loadIotaIndices(vec dst, immI_0 src, rRegP scratch) %{
7677   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
7678   match(Set dst (VectorLoadConst src));
7679   effect(TEMP scratch);
7680   format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
7681   ins_encode %{
7682      int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7683      __ load_iota_indices($dst$$XMMRegister, $scratch$$Register, vlen_in_bytes);
7684   %}
7685   ins_pipe( pipe_slow );
7686 %}
7687 
7688 //-------------------------------- Rearrange ----------------------------------
7689 
7690 // LoadShuffle/Rearrange for Byte
7691 
7692 instruct loadShuffleB(vec dst) %{
7693   predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
7694   match(Set dst (VectorLoadShuffle dst));
7695   format %{ "vector_load_shuffle $dst, $dst" %}
7696   ins_encode %{
7697     // empty
7698   %}
7699   ins_pipe( pipe_slow );
7700 %}
7701 
7702 instruct rearrangeB(vec dst, vec shuffle) %{
7703   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
7704             Matcher::vector_length(n) < 32);
7705   match(Set dst (VectorRearrange dst shuffle));
7706   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
7707   ins_encode %{
7708     assert(UseSSE >= 4, "required");
7709     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
7710   %}
7711   ins_pipe( pipe_slow );
7712 %}
7713 
7714 instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
7715   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
7716             Matcher::vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
7717   match(Set dst (VectorRearrange src shuffle));
7718   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
7719   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
7720   ins_encode %{
7721     assert(UseAVX >= 2, "required");
7722     // Swap src into vtmp1
7723     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to get entries from the other 128-bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to get entries from its own 128-bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit for entries whose shuffle index selects the other lane
7729     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
7730     // Perform the blend
7731     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
7732   %}
7733   ins_pipe( pipe_slow );
7734 %}
7735 
7736 instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
7737   predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
7738             Matcher::vector_length(n) >= 32 && VM_Version::supports_avx512_vbmi());
7739   match(Set dst (VectorRearrange src shuffle));
7740   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7741   ins_encode %{
7742     int vlen_enc = vector_length_encoding(this);
7743     __ vpermb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7744   %}
7745   ins_pipe( pipe_slow );
7746 %}
7747 
7748 // LoadShuffle/Rearrange for Short
7749 
7750 instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
7751   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7752             Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
7753   match(Set dst (VectorLoadShuffle src));
7754   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7755   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
7756   ins_encode %{
    // Create a byte shuffle mask from the short shuffle mask, since only a
    // byte shuffle instruction is available on these platforms
7759     int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
7760     if (UseAVX == 0) {
7761       assert(vlen_in_bytes <= 16, "required");
7762       // Multiply each shuffle by two to get byte index
7763       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
7764       __ psllw($vtmp$$XMMRegister, 1);
7765 
7766       // Duplicate to create 2 copies of byte index
7767       __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
7768       __ psllw($dst$$XMMRegister, 8);
7769       __ por($dst$$XMMRegister, $vtmp$$XMMRegister);
7770 
7771       // Add one to get alternate byte index
7772       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
7773       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
7774     } else {
7775       assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
7776       int vlen_enc = vector_length_encoding(this);
7777       // Multiply each shuffle by two to get byte index
7778       __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
7779       __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
7780 
7781       // Duplicate to create 2 copies of byte index
7782       __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister,  8, vlen_enc);
7783       __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
7784 
7785       // Add one to get alternate byte index
7786       __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, $scratch$$Register);
7787     }
7788   %}
7789   ins_pipe( pipe_slow );
7790 %}
7791 
7792 instruct rearrangeS(vec dst, vec shuffle) %{
7793   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7794             Matcher::vector_length(n) <= 8 && !VM_Version::supports_avx512bw());
7795   match(Set dst (VectorRearrange dst shuffle));
7796   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
7797   ins_encode %{
7798     assert(UseSSE >= 4, "required");
7799     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
7800   %}
7801   ins_pipe( pipe_slow );
7802 %}
7803 
7804 instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
7805   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7806             Matcher::vector_length(n) == 16 && !VM_Version::supports_avx512bw());
7807   match(Set dst (VectorRearrange src shuffle));
7808   effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
7809   format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
7810   ins_encode %{
7811     assert(UseAVX >= 2, "required");
7812     // Swap src into vtmp1
7813     __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
    // Shuffle the swapped src to get entries from the other 128-bit lane
    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Shuffle the original src to get entries from its own 128-bit lane
    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
    // Create a blend mask by setting the high bit for entries whose shuffle index selects the other lane
7819     __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
7820     // Perform the blend
7821     __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
7822   %}
7823   ins_pipe( pipe_slow );
7824 %}
7825 
7826 instruct loadShuffleS_evex(vec dst, vec src) %{
7827   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7828             VM_Version::supports_avx512bw());
7829   match(Set dst (VectorLoadShuffle src));
7830   format %{ "vector_load_shuffle $dst, $src" %}
7831   ins_encode %{
7832     int vlen_enc = vector_length_encoding(this);
7833     if (!VM_Version::supports_avx512vl()) {
7834       vlen_enc = Assembler::AVX_512bit;
7835     }
7836     __ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7837   %}
7838   ins_pipe( pipe_slow );
7839 %}
7840 
7841 instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
7842   predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
7843             VM_Version::supports_avx512bw());
7844   match(Set dst (VectorRearrange src shuffle));
7845   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7846   ins_encode %{
7847     int vlen_enc = vector_length_encoding(this);
7848     if (!VM_Version::supports_avx512vl()) {
7849       vlen_enc = Assembler::AVX_512bit;
7850     }
7851     __ vpermw($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7852   %}
7853   ins_pipe( pipe_slow );
7854 %}
7855 
7856 // LoadShuffle/Rearrange for Integer and Float
7857 
7858 instruct loadShuffleI(vec dst, vec src, vec vtmp, rRegP scratch) %{
7859   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
7860             Matcher::vector_length(n) == 4 && UseAVX < 2);
7861   match(Set dst (VectorLoadShuffle src));
7862   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7863   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
7864   ins_encode %{
7865     assert(UseSSE >= 4, "required");
7866 
    // Create a byte shuffle mask from the int shuffle mask, since only a
    // byte shuffle instruction is available on these platforms
7869 
7870     // Duplicate and multiply each shuffle by 4
7871     __ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
7872     __ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
7873     __ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
7874     __ psllw($vtmp$$XMMRegister, 2);
7875 
7876     // Duplicate again to create 4 copies of byte index
7877     __ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
7878     __ psllw($dst$$XMMRegister, 8);
7879     __ por($vtmp$$XMMRegister, $dst$$XMMRegister);
7880 
7881     // Add 3,2,1,0 to get alternate byte index
7882     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_int_shufflemask()), $scratch$$Register);
7883     __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
7884   %}
7885   ins_pipe( pipe_slow );
7886 %}
7887 
7888 instruct rearrangeI(vec dst, vec shuffle) %{
  predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
            Matcher::vector_length(n) == 4 && UseAVX < 2);
7891   match(Set dst (VectorRearrange dst shuffle));
7892   format %{ "vector_rearrange $dst, $shuffle, $dst" %}
7893   ins_encode %{
7894     assert(UseSSE >= 4, "required");
7895     __ pshufb($dst$$XMMRegister, $shuffle$$XMMRegister);
7896   %}
7897   ins_pipe( pipe_slow );
7898 %}
7899 
7900 instruct loadShuffleI_avx(vec dst, vec src) %{
7901   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
7902             UseAVX >= 2);
7903   match(Set dst (VectorLoadShuffle src));
7904   format %{ "vector_load_shuffle $dst, $src" %}
7905   ins_encode %{
    int vlen_enc = vector_length_encoding(this);
7907     __ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7908   %}
7909   ins_pipe( pipe_slow );
7910 %}
7911 
7912 instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
7913   predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
7914             UseAVX >= 2);
7915   match(Set dst (VectorRearrange src shuffle));
7916   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7917   ins_encode %{
7918     int vlen_enc = vector_length_encoding(this);
7919     if (vlen_enc == Assembler::AVX_128bit) {
7920       vlen_enc = Assembler::AVX_256bit;
7921     }
7922     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7923   %}
7924   ins_pipe( pipe_slow );
7925 %}
7926 
7927 // LoadShuffle/Rearrange for Long and Double
7928 
7929 instruct loadShuffleL(vec dst, vec src, vec vtmp, rRegP scratch) %{
7930   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7931             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
7932   match(Set dst (VectorLoadShuffle src));
7933   effect(TEMP dst, TEMP vtmp, TEMP scratch);
7934   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
7935   ins_encode %{
7936     assert(UseAVX >= 2, "required");
7937 
7938     int vlen_enc = vector_length_encoding(this);
    // Create a double word shuffle mask from the long shuffle mask, since only
    // a double word shuffle instruction is available on these platforms
7941 
7942     // Multiply each shuffle by two to get double word index
7943     __ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
7944     __ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
7945 
7946     // Duplicate each double word shuffle
7947     __ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
7948     __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
7949 
7950     // Add one to get alternate double word index
7951     __ vpaddd($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_long_shufflemask()), vlen_enc, $scratch$$Register);
7952   %}
7953   ins_pipe( pipe_slow );
7954 %}
7955 
7956 instruct rearrangeL(vec dst, vec src, vec shuffle) %{
7957   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7958             Matcher::vector_length(n) < 8 && !VM_Version::supports_avx512vl());
7959   match(Set dst (VectorRearrange src shuffle));
7960   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7961   ins_encode %{
7962     assert(UseAVX >= 2, "required");
7963 
7964     int vlen_enc = vector_length_encoding(this);
7965     __ vpermd($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7966   %}
7967   ins_pipe( pipe_slow );
7968 %}
7969 
7970 instruct loadShuffleL_evex(vec dst, vec src) %{
7971   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7972             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
7973   match(Set dst (VectorLoadShuffle src));
7974   format %{ "vector_load_shuffle $dst, $src" %}
7975   ins_encode %{
7976     assert(UseAVX > 2, "required");
7977 
7978     int vlen_enc = vector_length_encoding(this);
7979     __ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
7980   %}
7981   ins_pipe( pipe_slow );
7982 %}
7983 
7984 instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
7985   predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
7986             (Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
7987   match(Set dst (VectorRearrange src shuffle));
7988   format %{ "vector_rearrange $dst, $shuffle, $src" %}
7989   ins_encode %{
7990     assert(UseAVX > 2, "required");
7991 
7992     int vlen_enc = vector_length_encoding(this);
7993     if (vlen_enc == Assembler::AVX_128bit) {
7994       vlen_enc = Assembler::AVX_256bit;
7995     }
7996     __ vpermq($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, vlen_enc);
7997   %}
7998   ins_pipe( pipe_slow );
7999 %}
8000 
8001 // --------------------------------- FMA --------------------------------------
8002 // a * b + c
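// Each lane computes a fused multiply-add with a single rounding step
// (illustrative per-lane sketch):
//   c[i] = fma(a[i], b[i], c[i]);   // not round(a[i]*b[i]) followed by round(+c[i])
// The *_mem forms below fold the load of b into the instruction.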
8003 
8004 instruct vfmaF_reg(vec a, vec b, vec c) %{
8005   match(Set c (FmaVF  c (Binary a b)));
8006   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8007   ins_cost(150);
8008   ins_encode %{
8009     assert(UseFMA, "not enabled");
8010     int vlen_enc = vector_length_encoding(this);
8011     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8012   %}
8013   ins_pipe( pipe_slow );
8014 %}
8015 
8016 instruct vfmaF_mem(vec a, memory b, vec c) %{
8017   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8018   match(Set c (FmaVF  c (Binary a (LoadVector b))));
8019   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packedF" %}
8020   ins_cost(150);
8021   ins_encode %{
8022     assert(UseFMA, "not enabled");
8023     int vlen_enc = vector_length_encoding(this);
8024     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8025   %}
8026   ins_pipe( pipe_slow );
8027 %}
8028 
8029 instruct vfmaD_reg(vec a, vec b, vec c) %{
8030   match(Set c (FmaVD  c (Binary a b)));
8031   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8032   ins_cost(150);
8033   ins_encode %{
8034     assert(UseFMA, "not enabled");
8035     int vlen_enc = vector_length_encoding(this);
8036     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vlen_enc);
8037   %}
8038   ins_pipe( pipe_slow );
8039 %}
8040 
8041 instruct vfmaD_mem(vec a, memory b, vec c) %{
8042   predicate(Matcher::vector_length_in_bytes(n->in(1)) > 8);
8043   match(Set c (FmaVD  c (Binary a (LoadVector b))));
8044   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packedD" %}
8045   ins_cost(150);
8046   ins_encode %{
8047     assert(UseFMA, "not enabled");
8048     int vlen_enc = vector_length_encoding(this);
8049     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vlen_enc);
8050   %}
8051   ins_pipe( pipe_slow );
8052 %}
8053 
8054 // --------------------------------- Vector Multiply Add --------------------------------------
8055 
8056 instruct vmuladdS2I_reg_sse(vec dst, vec src1) %{
8057   predicate(UseAVX == 0);
8058   match(Set dst (MulAddVS2VI dst src1));
8059   format %{ "pmaddwd $dst,$src1\t! muladd packedStoI" %}
8060   ins_encode %{
8061     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
8062   %}
8063   ins_pipe( pipe_slow );
8064 %}
8065 
8066 instruct vmuladdS2I_reg_avx(vec dst, vec src1, vec src2) %{
8067   predicate(UseAVX > 0);
8068   match(Set dst (MulAddVS2VI src1 src2));
8069   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packedStoI" %}
8070   ins_encode %{
8071     int vlen_enc = vector_length_encoding(this);
8072     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8073   %}
8074   ins_pipe( pipe_slow );
8075 %}
8076 
8077 // --------------------------------- Vector Multiply Add Add ----------------------------------
8078 
8079 instruct vmuladdaddS2I_reg(vec dst, vec src1, vec src2) %{
8080   predicate(VM_Version::supports_avx512_vnni());
8081   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
8082   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packedStoI" %}
8083   ins_encode %{
8084     assert(UseAVX > 2, "required");
8085     int vlen_enc = vector_length_encoding(this);
8086     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vlen_enc);
8087   %}
8088   ins_pipe( pipe_slow );
8089   ins_cost(10);
8090 %}
8091 
8092 // --------------------------------- PopCount --------------------------------------
8093 
8094 instruct vpopcountI(vec dst, vec src) %{
8095   match(Set dst (PopCountVI src));
8096   format %{ "vpopcntd  $dst,$src\t! vector popcount packedI" %}
8097   ins_encode %{
8098     assert(UsePopCountInstruction, "not enabled");
8099 
8100     int vlen_enc = vector_length_encoding(this);
8101     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
8102   %}
8103   ins_pipe( pipe_slow );
8104 %}
8105 
8106 // --------------------------------- Bitwise Ternary Logic ----------------------------------
8107 
8108 instruct vpternlog(vec dst, vec src2, vec src3, immU8 func) %{
8109   match(Set dst (MacroLogicV (Binary dst src2) (Binary src3 func)));
8110   effect(TEMP dst);
8111   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8112   ins_encode %{
8113     int vector_len = vector_length_encoding(this);
8114     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$XMMRegister, vector_len);
8115   %}
8116   ins_pipe( pipe_slow );
8117 %}
8118 
8119 instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
8120   predicate(Matcher::vector_length_in_bytes(n->in(1)->in(1)) > 8);
8121   match(Set dst (MacroLogicV (Binary dst src2) (Binary (LoadVector src3) func)));
8122   effect(TEMP dst);
8123   format %{ "vpternlogd $dst,$src2,$src3,$func\t! vector ternary logic" %}
8124   ins_encode %{
8125     int vector_len = vector_length_encoding(this);
8126     __ vpternlogd($dst$$XMMRegister, $func$$constant, $src2$$XMMRegister, $src3$$Address, vector_len);
8127   %}
8128   ins_pipe( pipe_slow );
8129 %}
8130 
8131 // --------------------------------- Rotation Operations ----------------------------------
8132 instruct vprotate_immI8(vec dst, vec src, immI8 shift) %{
8133   match(Set dst (RotateLeftV src shift));
8134   match(Set dst (RotateRightV src shift));
8135   format %{ "vprotate_imm8 $dst,$src,$shift\t! vector rotate" %}
8136   ins_encode %{
8137     int opcode      = this->ideal_Opcode();
8138     int vector_len  = vector_length_encoding(this);
8139     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8140     __ vprotate_imm(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$constant, vector_len);
8141   %}
8142   ins_pipe( pipe_slow );
8143 %}
8144 
8145 instruct vprorate(vec dst, vec src, vec shift) %{
8146   match(Set dst (RotateLeftV src shift));
8147   match(Set dst (RotateRightV src shift));
8148   format %{ "vprotate $dst,$src,$shift\t! vector rotate" %}
8149   ins_encode %{
8150     int opcode      = this->ideal_Opcode();
8151     int vector_len  = vector_length_encoding(this);
8152     BasicType etype = this->bottom_type()->is_vect()->element_basic_type();
8153     __ vprotate_var(opcode, etype, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8154   %}
8155   ins_pipe( pipe_slow );
8156 %}
8157 
8158 #ifdef _LP64
8159 // ---------------------------------- Masked Operations ------------------------------------
8160 
8161 instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
8162   match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
8163   effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
8164   format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
8165   ins_encode %{
8166     assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
8167     assert(Matcher::vector_element_basic_type(this, $src1) == Matcher::vector_element_basic_type(this, $src2), "mismatch");
8168 
8169     Label DONE;
8170     int vlen_enc = vector_length_encoding(this, $src1);
8171     BasicType elem_bt = Matcher::vector_element_basic_type(this, $src1);
8172 
8173     __ knotql($ktmp2$$KRegister, $mask$$KRegister);
8174     __ mov64($dst$$Register, -1L);
8175     __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
8176     __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
8177     __ jccb(Assembler::carrySet, DONE);
8178     __ kmovql($dst$$Register, $ktmp1$$KRegister);
8179     __ notq($dst$$Register);
8180     __ tzcntq($dst$$Register, $dst$$Register);
8181     __ bind(DONE);
8182   %}
8183   ins_pipe( pipe_slow );
8184 %}
8185 
8187 instruct vmasked_load64(vec dst, memory mem, kReg mask) %{
8188   match(Set dst (LoadVectorMasked mem mask));
8189   format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
8190   ins_encode %{
8191     BasicType elmType =  this->bottom_type()->is_vect()->element_basic_type();
8192     int vector_len = vector_length_encoding(this);
8193     __ evmovdqu(elmType, $mask$$KRegister, $dst$$XMMRegister, $mem$$Address, vector_len);
8194   %}
8195   ins_pipe( pipe_slow );
8196 %}
8197 
8198 instruct vmask_gen(kReg dst, rRegL len, rRegL temp) %{
8199   match(Set dst (VectorMaskGen len));
8200   effect(TEMP temp);
8201   format %{ "vector_mask_gen32 $dst, $len \t! vector mask generator" %}
8202   ins_encode %{
8203     __ genmask($dst$$KRegister, $len$$Register, $temp$$Register);
8204   %}
8205   ins_pipe( pipe_slow );
8206 %}
8207 
8208 instruct vmask_gen_imm(kReg dst, immL len, rRegL temp) %{
8209   match(Set dst (VectorMaskGen len));
8210   format %{ "vector_mask_gen $len \t! vector mask generator" %}
8211   effect(TEMP temp);
8212   ins_encode %{
    __ mov64($temp$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 - $len$$constant)));
8214     __ kmovql($dst$$KRegister, $temp$$Register);
8215   %}
8216   ins_pipe( pipe_slow );
8217 %}
8218 
8219 instruct vmasked_store64(memory mem, vec src, kReg mask) %{
8220   match(Set mem (StoreVectorMasked mem (Binary src mask)));
8221   format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
8222   ins_encode %{
8223     const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
8224     BasicType elmType =  src_node->bottom_type()->is_vect()->element_basic_type();
8225     int vector_len = vector_length_encoding(src_node);
8226     __ evmovdqu(elmType, $mask$$KRegister, $mem$$Address, $src$$XMMRegister, vector_len);
8227   %}
8228   ins_pipe( pipe_slow );
8229 %}
8230 
8231 instruct vmask_truecount_evex(rRegI dst, vec mask, rRegL tmp, kReg ktmp, vec xtmp) %{
8232   predicate(VM_Version::supports_avx512vlbw());
8233   match(Set dst (VectorMaskTrueCount mask));
8234   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp, TEMP xtmp);
8235   format %{ "vector_truecount_evex $mask \t! vector mask true count" %}
8236   ins_encode %{
8237     int opcode = this->ideal_Opcode();
8238     int vlen_enc = vector_length_encoding(this, $mask);
8239     int mask_len = Matcher::vector_length(this, $mask);
8240     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8241                              $tmp$$Register, $ktmp$$KRegister, mask_len, vlen_enc);
8242   %}
8243   ins_pipe( pipe_slow );
8244 %}
8245 
8246 instruct vmask_first_or_last_true_evex(rRegI dst, vec mask, rRegL tmp, kReg ktmp, vec xtmp, rFlagsReg cr) %{
8247   predicate(VM_Version::supports_avx512vlbw());
8248   match(Set dst (VectorMaskFirstTrue mask));
8249   match(Set dst (VectorMaskLastTrue mask));
8250   effect(TEMP_DEF dst, TEMP tmp, TEMP ktmp, TEMP xtmp, KILL cr);
8251   format %{ "vector_mask_first_or_last_true_evex $mask \t! vector first/last true location" %}
8252   ins_encode %{
8253     int opcode = this->ideal_Opcode();
8254     int vlen_enc = vector_length_encoding(this, $mask);
8255     int mask_len = Matcher::vector_length(this, $mask);
8256     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8257                              $tmp$$Register, $ktmp$$KRegister, mask_len, vlen_enc);
8258   %}
8259   ins_pipe( pipe_slow );
8260 %}
8261 
8262 instruct vmask_truecount_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1) %{
8263   predicate(!VM_Version::supports_avx512vlbw());
8264   match(Set dst (VectorMaskTrueCount mask));
8265   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, TEMP xtmp1);
8266   format %{ "vector_truecount_avx $mask \t! vector mask true count" %}
8267   ins_encode %{
8268     int opcode = this->ideal_Opcode();
8269     int vlen_enc = vector_length_encoding(this, $mask);
8270     int mask_len = Matcher::vector_length(this, $mask);
8271     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8272                              $xtmp1$$XMMRegister, $tmp$$Register, mask_len, vlen_enc);
8273   %}
8274   ins_pipe( pipe_slow );
8275 %}
8276 
8277 instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1, rFlagsReg cr) %{
8278   predicate(!VM_Version::supports_avx512vlbw());
8279   match(Set dst (VectorMaskFirstTrue mask));
8280   match(Set dst (VectorMaskLastTrue mask));
8281   effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, TEMP xtmp1, KILL cr);
8282   format %{ "vector_mask_first_or_last_true_avx $mask \t! vector first/last true location" %}
8283   ins_encode %{
8284     int opcode = this->ideal_Opcode();
8285     int vlen_enc = vector_length_encoding(this, $mask);
8286     int mask_len = Matcher::vector_length(this, $mask);
8287     __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
8288                              $xtmp1$$XMMRegister, $tmp$$Register, mask_len, vlen_enc);
8289   %}
8290   ins_pipe( pipe_slow );
8291 %}
8292 #endif // _LP64
8293 
8294 instruct castVV(vec dst)
8295 %{
8296   match(Set dst (CastVV dst));
8297 
8298   size(0);
8299   format %{ "# castVV of $dst" %}
8300   ins_encode(/* empty encoding */);
8301   ins_cost(0);
8302   ins_pipe(empty);
8303 %}
8304 
8305 instruct castVVLeg(legVec dst)
8306 %{
8307   match(Set dst (CastVV dst));
8308 
8309   size(0);
8310   format %{ "# castVV of $dst" %}
8311   ins_encode(/* empty encoding */);
8312   ins_cost(0);
8313   ins_pipe(empty);
8314 %}