1 //
   2 // Copyright (c) 2011, 2019, Oracle and/or its affiliates. All rights reserved.
   3 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4 //
   5 // This code is free software; you can redistribute it and/or modify it
   6 // under the terms of the GNU General Public License version 2 only, as
   7 // published by the Free Software Foundation.
   8 //
   9 // This code is distributed in the hope that it will be useful, but WITHOUT
  10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12 // version 2 for more details (a copy is included in the LICENSE file that
  13 // accompanied this code).
  14 //
  15 // You should have received a copy of the GNU General Public License version
  16 // 2 along with this work; if not, write to the Free Software Foundation,
  17 // Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18 //
  19 // Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20 // or visit www.oracle.com if you need additional information or have any
  21 // questions.
  22 //
  23 //
  24 
  25 // X86 Common Architecture Description File
  26 
  27 //----------REGISTER DEFINITION BLOCK------------------------------------------
  28 // This information is used by the matcher and the register allocator to
  29 // describe individual registers and classes of registers within the target
  30 // architecture.
  31 
  32 register %{
  33 //----------Architecture Description Register Definitions----------------------
  34 // General Registers
  35 // "reg_def"  name ( register save type, C convention save type,
  36 //                   ideal register type, encoding );
  37 // Register Save Types:
  38 //
  39 // NS  = No-Save:       The register allocator assumes that these registers
  40 //                      can be used without saving upon entry to the method, &
  41 //                      that they do not need to be saved at call sites.
  42 //
  43 // SOC = Save-On-Call:  The register allocator assumes that these registers
  44 //                      can be used without saving upon entry to the method,
  45 //                      but that they must be saved at call sites.
  46 //
  47 // SOE = Save-On-Entry: The register allocator assumes that these registers
  48 //                      must be saved before using them upon entry to the
  49 //                      method, but they do not need to be saved at call
  50 //                      sites.
  51 //
  52 // AS  = Always-Save:   The register allocator assumes that these registers
  53 //                      must be saved before using them upon entry to the
  54 //                      method, & that they must be saved at call sites.
  55 //
  56 // Ideal Register Type is used to determine how to save & restore a
  57 // register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
  58 // spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
  59 //
  60 // The encoding number is the actual bit-pattern placed into the opcodes.
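     //
     // As a concrete reading of the format above, the first definition below,
     //   reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
     // declares a Save-On-Call register for both the allocator and the C
     // calling convention, spilled and restored as a float (ideal type
     // Op_RegF), with encoding 0, and backed by the VMReg of xmm0.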
  61 
  62 // XMM registers.  512-bit registers, i.e. 16 words each, labeled (a)-p.
  63 // Word a in each register holds a Float, words ab hold a Double.
  64 // The whole registers are used in SSE4.2 intrinsics,
  65 // array copy stubs and superword operations (see UseSSE42Intrinsics,
  66 // UseXMMForArrayCopy and UseSuperWord flags).
  67 // For pre-EVEX-enabled architectures:
  68 //      XMM8-XMM15 must be encoded with REX (VEX for UseAVX).
  69 // For EVEX-enabled architectures:
  70 //      XMM8-XMM31 must be encoded with REX (EVEX for UseAVX).
  71 //
  72 // Linux ABI:   No registers are preserved across function calls;
  73 //              XMM0-XMM7 might hold parameters.
  74 // Windows ABI: XMM6-XMM31 are preserved across function calls;
  75 //              XMM0-XMM3 might hold parameters.
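     //
     // Concretely, the sixteen entries XMM0 .. XMM0p defined below slice the
     // full 512-bit state of xmm0 into 32-bit VMReg slots (xmm0->as_VMReg()
     // and its ->next(1) .. ->next(15) successors): a Float occupies XMM0
     // alone, a Double occupies the pair XMM0/XMM0b, and a full 512-bit vector
     // spans all sixteen slots.  The same layout repeats for every XMM
     // register defined below.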
  76 
  77 reg_def XMM0 ( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg());
  78 reg_def XMM0b( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(1));
  79 reg_def XMM0c( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(2));
  80 reg_def XMM0d( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(3));
  81 reg_def XMM0e( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(4));
  82 reg_def XMM0f( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(5));
  83 reg_def XMM0g( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(6));
  84 reg_def XMM0h( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(7));
  85 reg_def XMM0i( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(8));
  86 reg_def XMM0j( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(9));
  87 reg_def XMM0k( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(10));
  88 reg_def XMM0l( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(11));
  89 reg_def XMM0m( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(12));
  90 reg_def XMM0n( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(13));
  91 reg_def XMM0o( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(14));
  92 reg_def XMM0p( SOC, SOC, Op_RegF, 0, xmm0->as_VMReg()->next(15));
  93 
  94 reg_def XMM1 ( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg());
  95 reg_def XMM1b( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(1));
  96 reg_def XMM1c( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(2));
  97 reg_def XMM1d( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(3));
  98 reg_def XMM1e( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(4));
  99 reg_def XMM1f( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(5));
 100 reg_def XMM1g( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(6));
 101 reg_def XMM1h( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(7));
 102 reg_def XMM1i( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(8));
 103 reg_def XMM1j( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(9));
 104 reg_def XMM1k( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(10));
 105 reg_def XMM1l( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(11));
 106 reg_def XMM1m( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(12));
 107 reg_def XMM1n( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(13));
 108 reg_def XMM1o( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(14));
 109 reg_def XMM1p( SOC, SOC, Op_RegF, 1, xmm1->as_VMReg()->next(15));
 110 
 111 reg_def XMM2 ( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg());
 112 reg_def XMM2b( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(1));
 113 reg_def XMM2c( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(2));
 114 reg_def XMM2d( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(3));
 115 reg_def XMM2e( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(4));
 116 reg_def XMM2f( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(5));
 117 reg_def XMM2g( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(6));
 118 reg_def XMM2h( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(7));
 119 reg_def XMM2i( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(8));
 120 reg_def XMM2j( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(9));
 121 reg_def XMM2k( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(10));
 122 reg_def XMM2l( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(11));
 123 reg_def XMM2m( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(12));
 124 reg_def XMM2n( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(13));
 125 reg_def XMM2o( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(14));
 126 reg_def XMM2p( SOC, SOC, Op_RegF, 2, xmm2->as_VMReg()->next(15));
 127 
 128 reg_def XMM3 ( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg());
 129 reg_def XMM3b( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(1));
 130 reg_def XMM3c( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(2));
 131 reg_def XMM3d( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(3));
 132 reg_def XMM3e( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(4));
 133 reg_def XMM3f( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(5));
 134 reg_def XMM3g( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(6));
 135 reg_def XMM3h( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(7));
 136 reg_def XMM3i( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(8));
 137 reg_def XMM3j( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(9));
 138 reg_def XMM3k( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(10));
 139 reg_def XMM3l( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(11));
 140 reg_def XMM3m( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(12));
 141 reg_def XMM3n( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(13));
 142 reg_def XMM3o( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(14));
 143 reg_def XMM3p( SOC, SOC, Op_RegF, 3, xmm3->as_VMReg()->next(15));
 144 
 145 reg_def XMM4 ( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg());
 146 reg_def XMM4b( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(1));
 147 reg_def XMM4c( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(2));
 148 reg_def XMM4d( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(3));
 149 reg_def XMM4e( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(4));
 150 reg_def XMM4f( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(5));
 151 reg_def XMM4g( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(6));
 152 reg_def XMM4h( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(7));
 153 reg_def XMM4i( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(8));
 154 reg_def XMM4j( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(9));
 155 reg_def XMM4k( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(10));
 156 reg_def XMM4l( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(11));
 157 reg_def XMM4m( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(12));
 158 reg_def XMM4n( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(13));
 159 reg_def XMM4o( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(14));
 160 reg_def XMM4p( SOC, SOC, Op_RegF, 4, xmm4->as_VMReg()->next(15));
 161 
 162 reg_def XMM5 ( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg());
 163 reg_def XMM5b( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(1));
 164 reg_def XMM5c( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(2));
 165 reg_def XMM5d( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(3));
 166 reg_def XMM5e( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(4));
 167 reg_def XMM5f( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(5));
 168 reg_def XMM5g( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(6));
 169 reg_def XMM5h( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(7));
 170 reg_def XMM5i( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(8));
 171 reg_def XMM5j( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(9));
 172 reg_def XMM5k( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(10));
 173 reg_def XMM5l( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(11));
 174 reg_def XMM5m( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(12));
 175 reg_def XMM5n( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(13));
 176 reg_def XMM5o( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(14));
 177 reg_def XMM5p( SOC, SOC, Op_RegF, 5, xmm5->as_VMReg()->next(15));
 178 
 179 reg_def XMM6 ( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg());
 180 reg_def XMM6b( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(1));
 181 reg_def XMM6c( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(2));
 182 reg_def XMM6d( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(3));
 183 reg_def XMM6e( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(4));
 184 reg_def XMM6f( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(5));
 185 reg_def XMM6g( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(6));
 186 reg_def XMM6h( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(7));
 187 reg_def XMM6i( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(8));
 188 reg_def XMM6j( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(9));
 189 reg_def XMM6k( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(10));
 190 reg_def XMM6l( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(11));
 191 reg_def XMM6m( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(12));
 192 reg_def XMM6n( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(13));
 193 reg_def XMM6o( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(14));
 194 reg_def XMM6p( SOC, SOC, Op_RegF, 6, xmm6->as_VMReg()->next(15));
 195 
 196 reg_def XMM7 ( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg());
 197 reg_def XMM7b( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(1));
 198 reg_def XMM7c( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(2));
 199 reg_def XMM7d( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(3));
 200 reg_def XMM7e( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(4));
 201 reg_def XMM7f( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(5));
 202 reg_def XMM7g( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(6));
 203 reg_def XMM7h( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(7));
 204 reg_def XMM7i( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(8));
 205 reg_def XMM7j( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(9));
 206 reg_def XMM7k( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(10));
 207 reg_def XMM7l( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(11));
 208 reg_def XMM7m( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(12));
 209 reg_def XMM7n( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(13));
 210 reg_def XMM7o( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(14));
 211 reg_def XMM7p( SOC, SOC, Op_RegF, 7, xmm7->as_VMReg()->next(15));
 212 
 213 #ifdef _LP64
 214 
 215 reg_def XMM8 ( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg());
 216 reg_def XMM8b( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(1));
 217 reg_def XMM8c( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(2));
 218 reg_def XMM8d( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(3));
 219 reg_def XMM8e( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(4));
 220 reg_def XMM8f( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(5));
 221 reg_def XMM8g( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(6));
 222 reg_def XMM8h( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(7));
 223 reg_def XMM8i( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(8));
 224 reg_def XMM8j( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(9));
 225 reg_def XMM8k( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(10));
 226 reg_def XMM8l( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(11));
 227 reg_def XMM8m( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(12));
 228 reg_def XMM8n( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(13));
 229 reg_def XMM8o( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(14));
 230 reg_def XMM8p( SOC, SOC, Op_RegF, 8, xmm8->as_VMReg()->next(15));
 231 
 232 reg_def XMM9 ( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg());
 233 reg_def XMM9b( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(1));
 234 reg_def XMM9c( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(2));
 235 reg_def XMM9d( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(3));
 236 reg_def XMM9e( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(4));
 237 reg_def XMM9f( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(5));
 238 reg_def XMM9g( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(6));
 239 reg_def XMM9h( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(7));
 240 reg_def XMM9i( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(8));
 241 reg_def XMM9j( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(9));
 242 reg_def XMM9k( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(10));
 243 reg_def XMM9l( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(11));
 244 reg_def XMM9m( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(12));
 245 reg_def XMM9n( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(13));
 246 reg_def XMM9o( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(14));
 247 reg_def XMM9p( SOC, SOC, Op_RegF, 9, xmm9->as_VMReg()->next(15));
 248 
 249 reg_def XMM10 ( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg());
 250 reg_def XMM10b( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(1));
 251 reg_def XMM10c( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(2));
 252 reg_def XMM10d( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(3));
 253 reg_def XMM10e( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(4));
 254 reg_def XMM10f( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(5));
 255 reg_def XMM10g( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(6));
 256 reg_def XMM10h( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(7));
 257 reg_def XMM10i( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(8));
 258 reg_def XMM10j( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(9));
 259 reg_def XMM10k( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(10));
 260 reg_def XMM10l( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(11));
 261 reg_def XMM10m( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(12));
 262 reg_def XMM10n( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(13));
 263 reg_def XMM10o( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(14));
 264 reg_def XMM10p( SOC, SOC, Op_RegF, 10, xmm10->as_VMReg()->next(15));
 265 
 266 reg_def XMM11 ( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg());
 267 reg_def XMM11b( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(1));
 268 reg_def XMM11c( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(2));
 269 reg_def XMM11d( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(3));
 270 reg_def XMM11e( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(4));
 271 reg_def XMM11f( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(5));
 272 reg_def XMM11g( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(6));
 273 reg_def XMM11h( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(7));
 274 reg_def XMM11i( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(8));
 275 reg_def XMM11j( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(9));
 276 reg_def XMM11k( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(10));
 277 reg_def XMM11l( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(11));
 278 reg_def XMM11m( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(12));
 279 reg_def XMM11n( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(13));
 280 reg_def XMM11o( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(14));
 281 reg_def XMM11p( SOC, SOC, Op_RegF, 11, xmm11->as_VMReg()->next(15));
 282 
 283 reg_def XMM12 ( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg());
 284 reg_def XMM12b( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(1));
 285 reg_def XMM12c( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(2));
 286 reg_def XMM12d( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(3));
 287 reg_def XMM12e( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(4));
 288 reg_def XMM12f( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(5));
 289 reg_def XMM12g( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(6));
 290 reg_def XMM12h( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(7));
 291 reg_def XMM12i( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(8));
 292 reg_def XMM12j( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(9));
 293 reg_def XMM12k( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(10));
 294 reg_def XMM12l( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(11));
 295 reg_def XMM12m( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(12));
 296 reg_def XMM12n( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(13));
 297 reg_def XMM12o( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(14));
 298 reg_def XMM12p( SOC, SOC, Op_RegF, 12, xmm12->as_VMReg()->next(15));
 299 
 300 reg_def XMM13 ( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg());
 301 reg_def XMM13b( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(1));
 302 reg_def XMM13c( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(2));
 303 reg_def XMM13d( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(3));
 304 reg_def XMM13e( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(4));
 305 reg_def XMM13f( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(5));
 306 reg_def XMM13g( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(6));
 307 reg_def XMM13h( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(7));
 308 reg_def XMM13i( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(8));
 309 reg_def XMM13j( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(9));
 310 reg_def XMM13k( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(10));
 311 reg_def XMM13l( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(11));
 312 reg_def XMM13m( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(12));
 313 reg_def XMM13n( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(13));
 314 reg_def XMM13o( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(14));
 315 reg_def XMM13p( SOC, SOC, Op_RegF, 13, xmm13->as_VMReg()->next(15));
 316 
 317 reg_def XMM14 ( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg());
 318 reg_def XMM14b( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(1));
 319 reg_def XMM14c( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(2));
 320 reg_def XMM14d( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(3));
 321 reg_def XMM14e( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(4));
 322 reg_def XMM14f( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(5));
 323 reg_def XMM14g( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(6));
 324 reg_def XMM14h( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(7));
 325 reg_def XMM14i( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(8));
 326 reg_def XMM14j( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(9));
 327 reg_def XMM14k( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(10));
 328 reg_def XMM14l( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(11));
 329 reg_def XMM14m( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(12));
 330 reg_def XMM14n( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(13));
 331 reg_def XMM14o( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(14));
 332 reg_def XMM14p( SOC, SOC, Op_RegF, 14, xmm14->as_VMReg()->next(15));
 333 
 334 reg_def XMM15 ( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg());
 335 reg_def XMM15b( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(1));
 336 reg_def XMM15c( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(2));
 337 reg_def XMM15d( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(3));
 338 reg_def XMM15e( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(4));
 339 reg_def XMM15f( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(5));
 340 reg_def XMM15g( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(6));
 341 reg_def XMM15h( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(7));
 342 reg_def XMM15i( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(8));
 343 reg_def XMM15j( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(9));
 344 reg_def XMM15k( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(10));
 345 reg_def XMM15l( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(11));
 346 reg_def XMM15m( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(12));
 347 reg_def XMM15n( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(13));
 348 reg_def XMM15o( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(14));
 349 reg_def XMM15p( SOC, SOC, Op_RegF, 15, xmm15->as_VMReg()->next(15));
 350 
 351 reg_def XMM16 ( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg());
 352 reg_def XMM16b( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(1));
 353 reg_def XMM16c( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(2));
 354 reg_def XMM16d( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(3));
 355 reg_def XMM16e( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(4));
 356 reg_def XMM16f( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(5));
 357 reg_def XMM16g( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(6));
 358 reg_def XMM16h( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(7));
 359 reg_def XMM16i( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(8));
 360 reg_def XMM16j( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(9));
 361 reg_def XMM16k( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(10));
 362 reg_def XMM16l( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(11));
 363 reg_def XMM16m( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(12));
 364 reg_def XMM16n( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(13));
 365 reg_def XMM16o( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(14));
 366 reg_def XMM16p( SOC, SOC, Op_RegF, 16, xmm16->as_VMReg()->next(15));
 367 
 368 reg_def XMM17 ( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg());
 369 reg_def XMM17b( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(1));
 370 reg_def XMM17c( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(2));
 371 reg_def XMM17d( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(3));
 372 reg_def XMM17e( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(4));
 373 reg_def XMM17f( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(5));
 374 reg_def XMM17g( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(6));
 375 reg_def XMM17h( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(7));
 376 reg_def XMM17i( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(8));
 377 reg_def XMM17j( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(9));
 378 reg_def XMM17k( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(10));
 379 reg_def XMM17l( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(11));
 380 reg_def XMM17m( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(12));
 381 reg_def XMM17n( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(13));
 382 reg_def XMM17o( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(14));
 383 reg_def XMM17p( SOC, SOC, Op_RegF, 17, xmm17->as_VMReg()->next(15));
 384 
 385 reg_def XMM18 ( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg());
 386 reg_def XMM18b( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(1));
 387 reg_def XMM18c( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(2));
 388 reg_def XMM18d( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(3));
 389 reg_def XMM18e( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(4));
 390 reg_def XMM18f( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(5));
 391 reg_def XMM18g( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(6));
 392 reg_def XMM18h( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(7));
 393 reg_def XMM18i( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(8));
 394 reg_def XMM18j( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(9));
 395 reg_def XMM18k( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(10));
 396 reg_def XMM18l( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(11));
 397 reg_def XMM18m( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(12));
 398 reg_def XMM18n( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(13));
 399 reg_def XMM18o( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(14));
 400 reg_def XMM18p( SOC, SOC, Op_RegF, 18, xmm18->as_VMReg()->next(15));
 401 
 402 reg_def XMM19 ( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg());
 403 reg_def XMM19b( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(1));
 404 reg_def XMM19c( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(2));
 405 reg_def XMM19d( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(3));
 406 reg_def XMM19e( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(4));
 407 reg_def XMM19f( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(5));
 408 reg_def XMM19g( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(6));
 409 reg_def XMM19h( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(7));
 410 reg_def XMM19i( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(8));
 411 reg_def XMM19j( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(9));
 412 reg_def XMM19k( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(10));
 413 reg_def XMM19l( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(11));
 414 reg_def XMM19m( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(12));
 415 reg_def XMM19n( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(13));
 416 reg_def XMM19o( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(14));
 417 reg_def XMM19p( SOC, SOC, Op_RegF, 19, xmm19->as_VMReg()->next(15));
 418 
 419 reg_def XMM20 ( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg());
 420 reg_def XMM20b( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(1));
 421 reg_def XMM20c( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(2));
 422 reg_def XMM20d( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(3));
 423 reg_def XMM20e( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(4));
 424 reg_def XMM20f( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(5));
 425 reg_def XMM20g( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(6));
 426 reg_def XMM20h( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(7));
 427 reg_def XMM20i( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(8));
 428 reg_def XMM20j( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(9));
 429 reg_def XMM20k( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(10));
 430 reg_def XMM20l( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(11));
 431 reg_def XMM20m( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(12));
 432 reg_def XMM20n( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(13));
 433 reg_def XMM20o( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(14));
 434 reg_def XMM20p( SOC, SOC, Op_RegF, 20, xmm20->as_VMReg()->next(15));
 435 
 436 reg_def XMM21 ( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg());
 437 reg_def XMM21b( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(1));
 438 reg_def XMM21c( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(2));
 439 reg_def XMM21d( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(3));
 440 reg_def XMM21e( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(4));
 441 reg_def XMM21f( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(5));
 442 reg_def XMM21g( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(6));
 443 reg_def XMM21h( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(7));
 444 reg_def XMM21i( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(8));
 445 reg_def XMM21j( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(9));
 446 reg_def XMM21k( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(10));
 447 reg_def XMM21l( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(11));
 448 reg_def XMM21m( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(12));
 449 reg_def XMM21n( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(13));
 450 reg_def XMM21o( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(14));
 451 reg_def XMM21p( SOC, SOC, Op_RegF, 21, xmm21->as_VMReg()->next(15));
 452 
 453 reg_def XMM22 ( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg());
 454 reg_def XMM22b( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(1));
 455 reg_def XMM22c( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(2));
 456 reg_def XMM22d( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(3));
 457 reg_def XMM22e( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(4));
 458 reg_def XMM22f( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(5));
 459 reg_def XMM22g( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(6));
 460 reg_def XMM22h( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(7));
 461 reg_def XMM22i( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(8));
 462 reg_def XMM22j( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(9));
 463 reg_def XMM22k( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(10));
 464 reg_def XMM22l( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(11));
 465 reg_def XMM22m( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(12));
 466 reg_def XMM22n( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(13));
 467 reg_def XMM22o( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(14));
 468 reg_def XMM22p( SOC, SOC, Op_RegF, 22, xmm22->as_VMReg()->next(15));
 469 
 470 reg_def XMM23 ( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg());
 471 reg_def XMM23b( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(1));
 472 reg_def XMM23c( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(2));
 473 reg_def XMM23d( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(3));
 474 reg_def XMM23e( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(4));
 475 reg_def XMM23f( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(5));
 476 reg_def XMM23g( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(6));
 477 reg_def XMM23h( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(7));
 478 reg_def XMM23i( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(8));
 479 reg_def XMM23j( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(9));
 480 reg_def XMM23k( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(10));
 481 reg_def XMM23l( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(11));
 482 reg_def XMM23m( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(12));
 483 reg_def XMM23n( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(13));
 484 reg_def XMM23o( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(14));
 485 reg_def XMM23p( SOC, SOC, Op_RegF, 23, xmm23->as_VMReg()->next(15));
 486 
 487 reg_def XMM24 ( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg());
 488 reg_def XMM24b( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(1));
 489 reg_def XMM24c( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(2));
 490 reg_def XMM24d( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(3));
 491 reg_def XMM24e( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(4));
 492 reg_def XMM24f( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(5));
 493 reg_def XMM24g( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(6));
 494 reg_def XMM24h( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(7));
 495 reg_def XMM24i( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(8));
 496 reg_def XMM24j( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(9));
 497 reg_def XMM24k( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(10));
 498 reg_def XMM24l( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(11));
 499 reg_def XMM24m( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(12));
 500 reg_def XMM24n( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(13));
 501 reg_def XMM24o( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(14));
 502 reg_def XMM24p( SOC, SOC, Op_RegF, 24, xmm24->as_VMReg()->next(15));
 503 
 504 reg_def XMM25 ( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg());
 505 reg_def XMM25b( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(1));
 506 reg_def XMM25c( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(2));
 507 reg_def XMM25d( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(3));
 508 reg_def XMM25e( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(4));
 509 reg_def XMM25f( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(5));
 510 reg_def XMM25g( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(6));
 511 reg_def XMM25h( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(7));
 512 reg_def XMM25i( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(8));
 513 reg_def XMM25j( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(9));
 514 reg_def XMM25k( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(10));
 515 reg_def XMM25l( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(11));
 516 reg_def XMM25m( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(12));
 517 reg_def XMM25n( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(13));
 518 reg_def XMM25o( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(14));
 519 reg_def XMM25p( SOC, SOC, Op_RegF, 25, xmm25->as_VMReg()->next(15));
 520 
 521 reg_def XMM26 ( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg());
 522 reg_def XMM26b( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(1));
 523 reg_def XMM26c( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(2));
 524 reg_def XMM26d( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(3));
 525 reg_def XMM26e( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(4));
 526 reg_def XMM26f( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(5));
 527 reg_def XMM26g( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(6));
 528 reg_def XMM26h( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(7));
 529 reg_def XMM26i( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(8));
 530 reg_def XMM26j( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(9));
 531 reg_def XMM26k( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(10));
 532 reg_def XMM26l( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(11));
 533 reg_def XMM26m( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(12));
 534 reg_def XMM26n( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(13));
 535 reg_def XMM26o( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(14));
 536 reg_def XMM26p( SOC, SOC, Op_RegF, 26, xmm26->as_VMReg()->next(15));
 537 
 538 reg_def XMM27 ( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg());
 539 reg_def XMM27b( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(1));
 540 reg_def XMM27c( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(2));
 541 reg_def XMM27d( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(3));
 542 reg_def XMM27e( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(4));
 543 reg_def XMM27f( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(5));
 544 reg_def XMM27g( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(6));
 545 reg_def XMM27h( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(7));
 546 reg_def XMM27i( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(8));
 547 reg_def XMM27j( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(9));
 548 reg_def XMM27k( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(10));
 549 reg_def XMM27l( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(11));
 550 reg_def XMM27m( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(12));
 551 reg_def XMM27n( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(13));
 552 reg_def XMM27o( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(14));
 553 reg_def XMM27p( SOC, SOC, Op_RegF, 27, xmm27->as_VMReg()->next(15));
 554 
 555 reg_def XMM28 ( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg());
 556 reg_def XMM28b( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(1));
 557 reg_def XMM28c( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(2));
 558 reg_def XMM28d( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(3));
 559 reg_def XMM28e( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(4));
 560 reg_def XMM28f( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(5));
 561 reg_def XMM28g( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(6));
 562 reg_def XMM28h( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(7));
 563 reg_def XMM28i( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(8));
 564 reg_def XMM28j( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(9));
 565 reg_def XMM28k( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(10));
 566 reg_def XMM28l( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(11));
 567 reg_def XMM28m( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(12));
 568 reg_def XMM28n( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(13));
 569 reg_def XMM28o( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(14));
 570 reg_def XMM28p( SOC, SOC, Op_RegF, 28, xmm28->as_VMReg()->next(15));
 571 
 572 reg_def XMM29 ( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg());
 573 reg_def XMM29b( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(1));
 574 reg_def XMM29c( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(2));
 575 reg_def XMM29d( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(3));
 576 reg_def XMM29e( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(4));
 577 reg_def XMM29f( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(5));
 578 reg_def XMM29g( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(6));
 579 reg_def XMM29h( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(7));
 580 reg_def XMM29i( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(8));
 581 reg_def XMM29j( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(9));
 582 reg_def XMM29k( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(10));
 583 reg_def XMM29l( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(11));
 584 reg_def XMM29m( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(12));
 585 reg_def XMM29n( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(13));
 586 reg_def XMM29o( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(14));
 587 reg_def XMM29p( SOC, SOC, Op_RegF, 29, xmm29->as_VMReg()->next(15));
 588 
 589 reg_def XMM30 ( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg());
 590 reg_def XMM30b( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(1));
 591 reg_def XMM30c( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(2));
 592 reg_def XMM30d( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(3));
 593 reg_def XMM30e( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(4));
 594 reg_def XMM30f( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(5));
 595 reg_def XMM30g( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(6));
 596 reg_def XMM30h( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(7));
 597 reg_def XMM30i( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(8));
 598 reg_def XMM30j( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(9));
 599 reg_def XMM30k( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(10));
 600 reg_def XMM30l( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(11));
 601 reg_def XMM30m( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(12));
 602 reg_def XMM30n( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(13));
 603 reg_def XMM30o( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(14));
 604 reg_def XMM30p( SOC, SOC, Op_RegF, 30, xmm30->as_VMReg()->next(15));
 605 
 606 reg_def XMM31 ( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg());
 607 reg_def XMM31b( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(1));
 608 reg_def XMM31c( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(2));
 609 reg_def XMM31d( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(3));
 610 reg_def XMM31e( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(4));
 611 reg_def XMM31f( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(5));
 612 reg_def XMM31g( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(6));
 613 reg_def XMM31h( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(7));
 614 reg_def XMM31i( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(8));
 615 reg_def XMM31j( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(9));
 616 reg_def XMM31k( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(10));
 617 reg_def XMM31l( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(11));
 618 reg_def XMM31m( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(12));
 619 reg_def XMM31n( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(13));
 620 reg_def XMM31o( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(14));
 621 reg_def XMM31p( SOC, SOC, Op_RegF, 31, xmm31->as_VMReg()->next(15));
 622 
 623 #endif // _LP64
 624 
 625 #ifdef _LP64
 626 reg_def RFLAGS(SOC, SOC, 0, 16, VMRegImpl::Bad());
 627 #else
 628 reg_def RFLAGS(SOC, SOC, 0, 8, VMRegImpl::Bad());
 629 #endif // _LP64
 630 
 631 alloc_class chunk1(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
 632                    XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
 633                    XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
 634                    XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
 635                    XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
 636                    XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
 637                    XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
 638                    XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
 639 #ifdef _LP64
 640                   ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
 641                    XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
 642                    XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
 643                    XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
 644                    XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
 645                    XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
 646                    XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
 647                    XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
 648                   ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
 649                    XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
 650                    XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
 651                    XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
 652                    XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
 653                    XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
 654                    XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
 655                    XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
 656                    XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
 657                    XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
 658                    XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
 659                    XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
 660                    XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
 661                    XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
 662                    XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
 663                    XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
 664 #endif
 665                       );
 666 
 667 // flags allocation class should be last.
 668 alloc_class chunk2(RFLAGS);
 669 
 670 // Singleton class for condition codes
 671 reg_class int_flags(RFLAGS);
 672 
 673 // Class for pre evex float registers
 674 reg_class float_reg_legacy(XMM0,
 675                     XMM1,
 676                     XMM2,
 677                     XMM3,
 678                     XMM4,
 679                     XMM5,
 680                     XMM6,
 681                     XMM7
 682 #ifdef _LP64
 683                    ,XMM8,
 684                     XMM9,
 685                     XMM10,
 686                     XMM11,
 687                     XMM12,
 688                     XMM13,
 689                     XMM14,
 690                     XMM15
 691 #endif
 692                     );
 693 
 694 // Class for evex float registers
 695 reg_class float_reg_evex(XMM0,
 696                     XMM1,
 697                     XMM2,
 698                     XMM3,
 699                     XMM4,
 700                     XMM5,
 701                     XMM6,
 702                     XMM7
 703 #ifdef _LP64
 704                    ,XMM8,
 705                     XMM9,
 706                     XMM10,
 707                     XMM11,
 708                     XMM12,
 709                     XMM13,
 710                     XMM14,
 711                     XMM15,
 712                     XMM16,
 713                     XMM17,
 714                     XMM18,
 715                     XMM19,
 716                     XMM20,
 717                     XMM21,
 718                     XMM22,
 719                     XMM23,
 720                     XMM24,
 721                     XMM25,
 722                     XMM26,
 723                     XMM27,
 724                     XMM28,
 725                     XMM29,
 726                     XMM30,
 727                     XMM31
 728 #endif
 729                     );
 730 
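     // The reg_class_dynamic entries below choose between the EVEX and legacy
     // classes defined above according to the %{ ... %} runtime predicate:
     // the EVEX forms are used only on CPUs where the predicate (e.g.
     // VM_Version::supports_evex()) is satisfied, otherwise the legacy class
     // applies.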
 731 reg_class_dynamic float_reg(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() %} );
 732 reg_class_dynamic float_reg_vl(float_reg_evex, float_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 733 
 734 // Class for pre evex double registers
 735 reg_class double_reg_legacy(XMM0,  XMM0b,
 736                      XMM1,  XMM1b,
 737                      XMM2,  XMM2b,
 738                      XMM3,  XMM3b,
 739                      XMM4,  XMM4b,
 740                      XMM5,  XMM5b,
 741                      XMM6,  XMM6b,
 742                      XMM7,  XMM7b
 743 #ifdef _LP64
 744                     ,XMM8,  XMM8b,
 745                      XMM9,  XMM9b,
 746                      XMM10, XMM10b,
 747                      XMM11, XMM11b,
 748                      XMM12, XMM12b,
 749                      XMM13, XMM13b,
 750                      XMM14, XMM14b,
 751                      XMM15, XMM15b
 752 #endif
 753                      );
 754 
 755 // Class for evex double registers
 756 reg_class double_reg_evex(XMM0,  XMM0b,
 757                      XMM1,  XMM1b,
 758                      XMM2,  XMM2b,
 759                      XMM3,  XMM3b,
 760                      XMM4,  XMM4b,
 761                      XMM5,  XMM5b,
 762                      XMM6,  XMM6b,
 763                      XMM7,  XMM7b
 764 #ifdef _LP64
 765                     ,XMM8,  XMM8b,
 766                      XMM9,  XMM9b,
 767                      XMM10, XMM10b,
 768                      XMM11, XMM11b,
 769                      XMM12, XMM12b,
 770                      XMM13, XMM13b,
 771                      XMM14, XMM14b,
 772                      XMM15, XMM15b,
 773                      XMM16, XMM16b,
 774                      XMM17, XMM17b,
 775                      XMM18, XMM18b,
 776                      XMM19, XMM19b,
 777                      XMM20, XMM20b,
 778                      XMM21, XMM21b,
 779                      XMM22, XMM22b,
 780                      XMM23, XMM23b,
 781                      XMM24, XMM24b,
 782                      XMM25, XMM25b,
 783                      XMM26, XMM26b,
 784                      XMM27, XMM27b,
 785                      XMM28, XMM28b,
 786                      XMM29, XMM29b,
 787                      XMM30, XMM30b,
 788                      XMM31, XMM31b
 789 #endif
 790                      );
 791 
 792 reg_class_dynamic double_reg(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() %} );
 793 reg_class_dynamic double_reg_vl(double_reg_evex, double_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
 794 
 795 // Class for pre evex 32bit vector registers
 796 reg_class vectors_reg_legacy(XMM0,
 797                       XMM1,
 798                       XMM2,
 799                       XMM3,
 800                       XMM4,
 801                       XMM5,
 802                       XMM6,
 803                       XMM7
 804 #ifdef _LP64
 805                      ,XMM8,
 806                       XMM9,
 807                       XMM10,
 808                       XMM11,
 809                       XMM12,
 810                       XMM13,
 811                       XMM14,
 812                       XMM15
 813 #endif
 814                       );
 815 
 816 // Class for evex 32bit vector registers
 817 reg_class vectors_reg_evex(XMM0,
 818                       XMM1,
 819                       XMM2,
 820                       XMM3,
 821                       XMM4,
 822                       XMM5,
 823                       XMM6,
 824                       XMM7
 825 #ifdef _LP64
 826                      ,XMM8,
 827                       XMM9,
 828                       XMM10,
 829                       XMM11,
 830                       XMM12,
 831                       XMM13,
 832                       XMM14,
 833                       XMM15,
 834                       XMM16,
 835                       XMM17,
 836                       XMM18,
 837                       XMM19,
 838                       XMM20,
 839                       XMM21,
 840                       XMM22,
 841                       XMM23,
 842                       XMM24,
 843                       XMM25,
 844                       XMM26,
 845                       XMM27,
 846                       XMM28,
 847                       XMM29,
 848                       XMM30,
 849                       XMM31
 850 #endif
 851                       );
 852 
 853 reg_class_dynamic vectors_reg(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_evex() %} );
 854 reg_class_dynamic vectors_reg_vlbwdq(vectors_reg_evex, vectors_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 855 
 856 // Class for all 64bit vector registers
 857 reg_class vectord_reg_legacy(XMM0,  XMM0b,
 858                       XMM1,  XMM1b,
 859                       XMM2,  XMM2b,
 860                       XMM3,  XMM3b,
 861                       XMM4,  XMM4b,
 862                       XMM5,  XMM5b,
 863                       XMM6,  XMM6b,
 864                       XMM7,  XMM7b
 865 #ifdef _LP64
 866                      ,XMM8,  XMM8b,
 867                       XMM9,  XMM9b,
 868                       XMM10, XMM10b,
 869                       XMM11, XMM11b,
 870                       XMM12, XMM12b,
 871                       XMM13, XMM13b,
 872                       XMM14, XMM14b,
 873                       XMM15, XMM15b
 874 #endif
 875                       );
 876 
 877 // Class for all 64bit vector registers
 878 reg_class vectord_reg_evex(XMM0,  XMM0b,
 879                       XMM1,  XMM1b,
 880                       XMM2,  XMM2b,
 881                       XMM3,  XMM3b,
 882                       XMM4,  XMM4b,
 883                       XMM5,  XMM5b,
 884                       XMM6,  XMM6b,
 885                       XMM7,  XMM7b
 886 #ifdef _LP64
 887                      ,XMM8,  XMM8b,
 888                       XMM9,  XMM9b,
 889                       XMM10, XMM10b,
 890                       XMM11, XMM11b,
 891                       XMM12, XMM12b,
 892                       XMM13, XMM13b,
 893                       XMM14, XMM14b,
 894                       XMM15, XMM15b,
 895                       XMM16, XMM16b,
 896                       XMM17, XMM17b,
 897                       XMM18, XMM18b,
 898                       XMM19, XMM19b,
 899                       XMM20, XMM20b,
 900                       XMM21, XMM21b,
 901                       XMM22, XMM22b,
 902                       XMM23, XMM23b,
 903                       XMM24, XMM24b,
 904                       XMM25, XMM25b,
 905                       XMM26, XMM26b,
 906                       XMM27, XMM27b,
 907                       XMM28, XMM28b,
 908                       XMM29, XMM29b,
 909                       XMM30, XMM30b,
 910                       XMM31, XMM31b
 911 #endif
 912                       );
 913 
 914 reg_class_dynamic vectord_reg(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_evex() %} );
 915 reg_class_dynamic vectord_reg_vlbwdq(vectord_reg_evex, vectord_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 916 
 917 // Class for all 128bit vector registers
 918 reg_class vectorx_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,
 919                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 920                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 921                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 922                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 923                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 924                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 925                       XMM7,  XMM7b,  XMM7c,  XMM7d
 926 #ifdef _LP64
 927                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 928                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 929                       XMM10, XMM10b, XMM10c, XMM10d,
 930                       XMM11, XMM11b, XMM11c, XMM11d,
 931                       XMM12, XMM12b, XMM12c, XMM12d,
 932                       XMM13, XMM13b, XMM13c, XMM13d,
 933                       XMM14, XMM14b, XMM14c, XMM14d,
 934                       XMM15, XMM15b, XMM15c, XMM15d
 935 #endif
 936                       );
 937 
 938 // Class for all 128bit vector registers
 939 reg_class vectorx_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,
 940                       XMM1,  XMM1b,  XMM1c,  XMM1d,
 941                       XMM2,  XMM2b,  XMM2c,  XMM2d,
 942                       XMM3,  XMM3b,  XMM3c,  XMM3d,
 943                       XMM4,  XMM4b,  XMM4c,  XMM4d,
 944                       XMM5,  XMM5b,  XMM5c,  XMM5d,
 945                       XMM6,  XMM6b,  XMM6c,  XMM6d,
 946                       XMM7,  XMM7b,  XMM7c,  XMM7d
 947 #ifdef _LP64
 948                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,
 949                       XMM9,  XMM9b,  XMM9c,  XMM9d,
 950                       XMM10, XMM10b, XMM10c, XMM10d,
 951                       XMM11, XMM11b, XMM11c, XMM11d,
 952                       XMM12, XMM12b, XMM12c, XMM12d,
 953                       XMM13, XMM13b, XMM13c, XMM13d,
 954                       XMM14, XMM14b, XMM14c, XMM14d,
 955                       XMM15, XMM15b, XMM15c, XMM15d,
 956                       XMM16, XMM16b, XMM16c, XMM16d,
 957                       XMM17, XMM17b, XMM17c, XMM17d,
 958                       XMM18, XMM18b, XMM18c, XMM18d,
 959                       XMM19, XMM19b, XMM19c, XMM19d,
 960                       XMM20, XMM20b, XMM20c, XMM20d,
 961                       XMM21, XMM21b, XMM21c, XMM21d,
 962                       XMM22, XMM22b, XMM22c, XMM22d,
 963                       XMM23, XMM23b, XMM23c, XMM23d,
 964                       XMM24, XMM24b, XMM24c, XMM24d,
 965                       XMM25, XMM25b, XMM25c, XMM25d,
 966                       XMM26, XMM26b, XMM26c, XMM26d,
 967                       XMM27, XMM27b, XMM27c, XMM27d,
 968                       XMM28, XMM28b, XMM28c, XMM28d,
 969                       XMM29, XMM29b, XMM29c, XMM29d,
 970                       XMM30, XMM30b, XMM30c, XMM30d,
 971                       XMM31, XMM31b, XMM31c, XMM31d
 972 #endif
 973                       );
 974 
 975 reg_class_dynamic vectorx_reg(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_evex() %} );
 976 reg_class_dynamic vectorx_reg_vlbwdq(vectorx_reg_evex, vectorx_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
 977 
 978 // Class for all 256bit vector registers
 979 reg_class vectory_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
 980                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
 981                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
 982                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
 983                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
 984                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
 985                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
 986                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
 987 #ifdef _LP64
 988                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
 989                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
 990                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
 991                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
 992                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
 993                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
 994                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
 995                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h
 996 #endif
 997                       );
 998 
 999 // Class for all 256bit vector registers
1000 reg_class vectory_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,
1001                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,
1002                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,
1003                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,
1004                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,
1005                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,
1006                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,
1007                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h
1008 #ifdef _LP64
1009                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,
1010                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,
1011                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h,
1012                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h,
1013                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h,
1014                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h,
1015                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h,
1016                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h,
1017                       XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h,
1018                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h,
1019                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h,
1020                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h,
1021                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h,
1022                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h,
1023                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h,
1024                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h,
1025                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h,
1026                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h,
1027                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h,
1028                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h,
1029                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h,
1030                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h,
1031                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h,
1032                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h
1033 #endif
1034                       );
1035 
1036 reg_class_dynamic vectory_reg(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_evex() %} );
1037 reg_class_dynamic vectory_reg_vlbwdq(vectory_reg_evex, vectory_reg_legacy, %{ VM_Version::supports_avx512vlbwdq() %} );
1038 
1039 // Class for all 512bit vector registers
1040 reg_class vectorz_reg_evex(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1041                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1042                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1043                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1044                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1045                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1046                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1047                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1048 #ifdef _LP64
1049                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1050                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1051                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1052                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1053                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1054                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1055                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1056                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1057                      ,XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p,
1058                       XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p,
1059                       XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p,
1060                       XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p,
1061                       XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p,
1062                       XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p,
1063                       XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p,
1064                       XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p,
1065                       XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p,
1066                       XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p,
1067                       XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p,
1068                       XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p,
1069                       XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p,
1070                       XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p,
1071                       XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p,
1072                       XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p
1073 #endif
1074                       );
1075 
1076 // Class for restricted 512bit vector registers
1077 reg_class vectorz_reg_legacy(XMM0,  XMM0b,  XMM0c,  XMM0d,  XMM0e,  XMM0f,  XMM0g,  XMM0h,  XMM0i,  XMM0j,  XMM0k,  XMM0l,  XMM0m,  XMM0n,  XMM0o,  XMM0p,
1078                       XMM1,  XMM1b,  XMM1c,  XMM1d,  XMM1e,  XMM1f,  XMM1g,  XMM1h,  XMM1i,  XMM1j,  XMM1k,  XMM1l,  XMM1m,  XMM1n,  XMM1o,  XMM1p,
1079                       XMM2,  XMM2b,  XMM2c,  XMM2d,  XMM2e,  XMM2f,  XMM2g,  XMM2h,  XMM2i,  XMM2j,  XMM2k,  XMM2l,  XMM2m,  XMM2n,  XMM2o,  XMM2p,
1080                       XMM3,  XMM3b,  XMM3c,  XMM3d,  XMM3e,  XMM3f,  XMM3g,  XMM3h,  XMM3i,  XMM3j,  XMM3k,  XMM3l,  XMM3m,  XMM3n,  XMM3o,  XMM3p,
1081                       XMM4,  XMM4b,  XMM4c,  XMM4d,  XMM4e,  XMM4f,  XMM4g,  XMM4h,  XMM4i,  XMM4j,  XMM4k,  XMM4l,  XMM4m,  XMM4n,  XMM4o,  XMM4p,
1082                       XMM5,  XMM5b,  XMM5c,  XMM5d,  XMM5e,  XMM5f,  XMM5g,  XMM5h,  XMM5i,  XMM5j,  XMM5k,  XMM5l,  XMM5m,  XMM5n,  XMM5o,  XMM5p,
1083                       XMM6,  XMM6b,  XMM6c,  XMM6d,  XMM6e,  XMM6f,  XMM6g,  XMM6h,  XMM6i,  XMM6j,  XMM6k,  XMM6l,  XMM6m,  XMM6n,  XMM6o,  XMM6p,
1084                       XMM7,  XMM7b,  XMM7c,  XMM7d,  XMM7e,  XMM7f,  XMM7g,  XMM7h,  XMM7i,  XMM7j,  XMM7k,  XMM7l,  XMM7m,  XMM7n,  XMM7o,  XMM7p
1085 #ifdef _LP64
1086                      ,XMM8,  XMM8b,  XMM8c,  XMM8d,  XMM8e,  XMM8f,  XMM8g,  XMM8h,  XMM8i,  XMM8j,  XMM8k,  XMM8l,  XMM8m,  XMM8n,  XMM8o,  XMM8p,
1087                       XMM9,  XMM9b,  XMM9c,  XMM9d,  XMM9e,  XMM9f,  XMM9g,  XMM9h,  XMM9i,  XMM9j,  XMM9k,  XMM9l,  XMM9m,  XMM9n,  XMM9o,  XMM9p,
1088                       XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p,
1089                       XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p,
1090                       XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p,
1091                       XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p,
1092                       XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p,
1093                       XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p
1094 #endif
1095                       );
1096 
1097 reg_class_dynamic vectorz_reg(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() %} );
1098 reg_class_dynamic vectorz_reg_vl(vectorz_reg_evex, vectorz_reg_legacy, %{ VM_Version::supports_evex() && VM_Version::supports_avx512vl() %} );
1099 
1100 reg_class xmm0_reg(XMM0, XMM0b, XMM0c, XMM0d);
1101 reg_class ymm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h);
1102 reg_class zmm0_reg(XMM0, XMM0b, XMM0c, XMM0d, XMM0e, XMM0f, XMM0g, XMM0h, XMM0i, XMM0j, XMM0k, XMM0l, XMM0m, XMM0n, XMM0o, XMM0p);
1103 
1104 reg_class xmm1_reg(XMM1, XMM1b, XMM1c, XMM1d);
1105 reg_class ymm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h);
1106 reg_class zmm1_reg(XMM1, XMM1b, XMM1c, XMM1d, XMM1e, XMM1f, XMM1g, XMM1h, XMM1i, XMM1j, XMM1k, XMM1l, XMM1m, XMM1n, XMM1o, XMM1p);
1107 
1108 reg_class xmm2_reg(XMM2, XMM2b, XMM2c, XMM2d);
1109 reg_class ymm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h);
1110 reg_class zmm2_reg(XMM2, XMM2b, XMM2c, XMM2d, XMM2e, XMM2f, XMM2g, XMM2h, XMM2i, XMM2j, XMM2k, XMM2l, XMM2m, XMM2n, XMM2o, XMM2p);
1111 
1112 reg_class xmm3_reg(XMM3, XMM3b, XMM3c, XMM3d);
1113 reg_class ymm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h);
1114 reg_class zmm3_reg(XMM3, XMM3b, XMM3c, XMM3d, XMM3e, XMM3f, XMM3g, XMM3h, XMM3i, XMM3j, XMM3k, XMM3l, XMM3m, XMM3n, XMM3o, XMM3p);
1115 
1116 reg_class xmm4_reg(XMM4, XMM4b, XMM4c, XMM4d);
1117 reg_class ymm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h);
1118 reg_class zmm4_reg(XMM4, XMM4b, XMM4c, XMM4d, XMM4e, XMM4f, XMM4g, XMM4h, XMM4i, XMM4j, XMM4k, XMM4l, XMM4m, XMM4n, XMM4o, XMM4p);
1119 
1120 reg_class xmm5_reg(XMM5, XMM5b, XMM5c, XMM5d);
1121 reg_class ymm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h);
1122 reg_class zmm5_reg(XMM5, XMM5b, XMM5c, XMM5d, XMM5e, XMM5f, XMM5g, XMM5h, XMM5i, XMM5j, XMM5k, XMM5l, XMM5m, XMM5n, XMM5o, XMM5p);
1123 
1124 reg_class xmm6_reg(XMM6, XMM6b, XMM6c, XMM6d);
1125 reg_class ymm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h);
1126 reg_class zmm6_reg(XMM6, XMM6b, XMM6c, XMM6d, XMM6e, XMM6f, XMM6g, XMM6h, XMM6i, XMM6j, XMM6k, XMM6l, XMM6m, XMM6n, XMM6o, XMM6p);
1127 
1128 reg_class xmm7_reg(XMM7, XMM7b, XMM7c, XMM7d);
1129 reg_class ymm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h);
1130 reg_class zmm7_reg(XMM7, XMM7b, XMM7c, XMM7d, XMM7e, XMM7f, XMM7g, XMM7h, XMM7i, XMM7j, XMM7k, XMM7l, XMM7m, XMM7n, XMM7o, XMM7p);
1131 
1132 #ifdef _LP64
1133 
1134 reg_class xmm8_reg(XMM8, XMM8b, XMM8c, XMM8d);
1135 reg_class ymm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h);
1136 reg_class zmm8_reg(XMM8, XMM8b, XMM8c, XMM8d, XMM8e, XMM8f, XMM8g, XMM8h, XMM8i, XMM8j, XMM8k, XMM8l, XMM8m, XMM8n, XMM8o, XMM8p);
1137 
1138 reg_class xmm9_reg(XMM9, XMM9b, XMM9c, XMM9d);
1139 reg_class ymm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h);
1140 reg_class zmm9_reg(XMM9, XMM9b, XMM9c, XMM9d, XMM9e, XMM9f, XMM9g, XMM9h, XMM9i, XMM9j, XMM9k, XMM9l, XMM9m, XMM9n, XMM9o, XMM9p);
1141 
1142 reg_class xmm10_reg(XMM10, XMM10b, XMM10c, XMM10d);
1143 reg_class ymm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h);
1144 reg_class zmm10_reg(XMM10, XMM10b, XMM10c, XMM10d, XMM10e, XMM10f, XMM10g, XMM10h, XMM10i, XMM10j, XMM10k, XMM10l, XMM10m, XMM10n, XMM10o, XMM10p);
1145 
1146 reg_class xmm11_reg(XMM11, XMM11b, XMM11c, XMM11d);
1147 reg_class ymm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h);
1148 reg_class zmm11_reg(XMM11, XMM11b, XMM11c, XMM11d, XMM11e, XMM11f, XMM11g, XMM11h, XMM11i, XMM11j, XMM11k, XMM11l, XMM11m, XMM11n, XMM11o, XMM11p);
1149 
1150 reg_class xmm12_reg(XMM12, XMM12b, XMM12c, XMM12d);
1151 reg_class ymm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h);
1152 reg_class zmm12_reg(XMM12, XMM12b, XMM12c, XMM12d, XMM12e, XMM12f, XMM12g, XMM12h, XMM12i, XMM12j, XMM12k, XMM12l, XMM12m, XMM12n, XMM12o, XMM12p);
1153 
1154 reg_class xmm13_reg(XMM13, XMM13b, XMM13c, XMM13d);
1155 reg_class ymm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h);
1156 reg_class zmm13_reg(XMM13, XMM13b, XMM13c, XMM13d, XMM13e, XMM13f, XMM13g, XMM13h, XMM13i, XMM13j, XMM13k, XMM13l, XMM13m, XMM13n, XMM13o, XMM13p);
1157 
1158 reg_class xmm14_reg(XMM14, XMM14b, XMM14c, XMM14d);
1159 reg_class ymm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h);
1160 reg_class zmm14_reg(XMM14, XMM14b, XMM14c, XMM14d, XMM14e, XMM14f, XMM14g, XMM14h, XMM14i, XMM14j, XMM14k, XMM14l, XMM14m, XMM14n, XMM14o, XMM14p);
1161 
1162 reg_class xmm15_reg(XMM15, XMM15b, XMM15c, XMM15d);
1163 reg_class ymm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h);
1164 reg_class zmm15_reg(XMM15, XMM15b, XMM15c, XMM15d, XMM15e, XMM15f, XMM15g, XMM15h, XMM15i, XMM15j, XMM15k, XMM15l, XMM15m, XMM15n, XMM15o, XMM15p);
1165 
1166 reg_class xmm16_reg(XMM16, XMM16b, XMM16c, XMM16d);
1167 reg_class ymm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h);
1168 reg_class zmm16_reg(XMM16, XMM16b, XMM16c, XMM16d, XMM16e, XMM16f, XMM16g, XMM16h, XMM16i, XMM16j, XMM16k, XMM16l, XMM16m, XMM16n, XMM16o, XMM16p);
1169 
1170 reg_class xmm17_reg(XMM17, XMM17b, XMM17c, XMM17d);
1171 reg_class ymm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h);
1172 reg_class zmm17_reg(XMM17, XMM17b, XMM17c, XMM17d, XMM17e, XMM17f, XMM17g, XMM17h, XMM17i, XMM17j, XMM17k, XMM17l, XMM17m, XMM17n, XMM17o, XMM17p);
1173 
1174 reg_class xmm18_reg(XMM18, XMM18b, XMM18c, XMM18d);
1175 reg_class ymm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h);
1176 reg_class zmm18_reg(XMM18, XMM18b, XMM18c, XMM18d, XMM18e, XMM18f, XMM18g, XMM18h, XMM18i, XMM18j, XMM18k, XMM18l, XMM18m, XMM18n, XMM18o, XMM18p);
1177 
1178 reg_class xmm19_reg(XMM19, XMM19b, XMM19c, XMM19d);
1179 reg_class ymm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h);
1180 reg_class zmm19_reg(XMM19, XMM19b, XMM19c, XMM19d, XMM19e, XMM19f, XMM19g, XMM19h, XMM19i, XMM19j, XMM19k, XMM19l, XMM19m, XMM19n, XMM19o, XMM19p);
1181 
1182 reg_class xmm20_reg(XMM20, XMM20b, XMM20c, XMM20d);
1183 reg_class ymm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h);
1184 reg_class zmm20_reg(XMM20, XMM20b, XMM20c, XMM20d, XMM20e, XMM20f, XMM20g, XMM20h, XMM20i, XMM20j, XMM20k, XMM20l, XMM20m, XMM20n, XMM20o, XMM20p);
1185 
1186 reg_class xmm21_reg(XMM21, XMM21b, XMM21c, XMM21d);
1187 reg_class ymm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h);
1188 reg_class zmm21_reg(XMM21, XMM21b, XMM21c, XMM21d, XMM21e, XMM21f, XMM21g, XMM21h, XMM21i, XMM21j, XMM21k, XMM21l, XMM21m, XMM21n, XMM21o, XMM21p);
1189 
1190 reg_class xmm22_reg(XMM22, XMM22b, XMM22c, XMM22d);
1191 reg_class ymm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h);
1192 reg_class zmm22_reg(XMM22, XMM22b, XMM22c, XMM22d, XMM22e, XMM22f, XMM22g, XMM22h, XMM22i, XMM22j, XMM22k, XMM22l, XMM22m, XMM22n, XMM22o, XMM22p);
1193 
1194 reg_class xmm23_reg(XMM23, XMM23b, XMM23c, XMM23d);
1195 reg_class ymm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h);
1196 reg_class zmm23_reg(XMM23, XMM23b, XMM23c, XMM23d, XMM23e, XMM23f, XMM23g, XMM23h, XMM23i, XMM23j, XMM23k, XMM23l, XMM23m, XMM23n, XMM23o, XMM23p);
1197 
1198 reg_class xmm24_reg(XMM24, XMM24b, XMM24c, XMM24d);
1199 reg_class ymm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h);
1200 reg_class zmm24_reg(XMM24, XMM24b, XMM24c, XMM24d, XMM24e, XMM24f, XMM24g, XMM24h, XMM24i, XMM24j, XMM24k, XMM24l, XMM24m, XMM24n, XMM24o, XMM24p);
1201 
1202 reg_class xmm25_reg(XMM25, XMM25b, XMM25c, XMM25d);
1203 reg_class ymm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h);
1204 reg_class zmm25_reg(XMM25, XMM25b, XMM25c, XMM25d, XMM25e, XMM25f, XMM25g, XMM25h, XMM25i, XMM25j, XMM25k, XMM25l, XMM25m, XMM25n, XMM25o, XMM25p);
1205 
1206 reg_class xmm26_reg(XMM26, XMM26b, XMM26c, XMM26d);
1207 reg_class ymm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h);
1208 reg_class zmm26_reg(XMM26, XMM26b, XMM26c, XMM26d, XMM26e, XMM26f, XMM26g, XMM26h, XMM26i, XMM26j, XMM26k, XMM26l, XMM26m, XMM26n, XMM26o, XMM26p);
1209 
1210 reg_class xmm27_reg(XMM27, XMM27b, XMM27c, XMM27d);
1211 reg_class ymm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h);
1212 reg_class zmm27_reg(XMM27, XMM27b, XMM27c, XMM27d, XMM27e, XMM27f, XMM27g, XMM27h, XMM27i, XMM27j, XMM27k, XMM27l, XMM27m, XMM27n, XMM27o, XMM27p);
1213 
1214 reg_class xmm28_reg(XMM28, XMM28b, XMM28c, XMM28d);
1215 reg_class ymm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h);
1216 reg_class zmm28_reg(XMM28, XMM28b, XMM28c, XMM28d, XMM28e, XMM28f, XMM28g, XMM28h, XMM28i, XMM28j, XMM28k, XMM28l, XMM28m, XMM28n, XMM28o, XMM28p);
1217 
1218 reg_class xmm29_reg(XMM29, XMM29b, XMM29c, XMM29d);
1219 reg_class ymm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h);
1220 reg_class zmm29_reg(XMM29, XMM29b, XMM29c, XMM29d, XMM29e, XMM29f, XMM29g, XMM29h, XMM29i, XMM29j, XMM29k, XMM29l, XMM29m, XMM29n, XMM29o, XMM29p);
1221 
1222 reg_class xmm30_reg(XMM30, XMM30b, XMM30c, XMM30d);
1223 reg_class ymm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h);
1224 reg_class zmm30_reg(XMM30, XMM30b, XMM30c, XMM30d, XMM30e, XMM30f, XMM30g, XMM30h, XMM30i, XMM30j, XMM30k, XMM30l, XMM30m, XMM30n, XMM30o, XMM30p);
1225 
1226 reg_class xmm31_reg(XMM31, XMM31b, XMM31c, XMM31d);
1227 reg_class ymm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h);
1228 reg_class zmm31_reg(XMM31, XMM31b, XMM31c, XMM31d, XMM31e, XMM31f, XMM31g, XMM31h, XMM31i, XMM31j, XMM31k, XMM31l, XMM31m, XMM31n, XMM31o, XMM31p);
1229 
1230 #endif
1231 
1232 %}
1233 
1234 
1235 //----------SOURCE BLOCK-------------------------------------------------------
1236 // This is a block of C++ code which provides values, functions, and
1237 // definitions necessary in the rest of the architecture description
1238 
1239 source_hpp %{
1240 // Header information of the source block.
1241 // Method declarations/definitions which are used outside
1242 // the ad-scope can conveniently be defined here.
1243 //
1244 // To keep related declarations/definitions/uses close together,
// we switch between source %{ %} and source_hpp %{ %} freely as needed.
1246 
1247 class NativeJump;
1248 
1249 class CallStubImpl {
1250 
1251   //--------------------------------------------------------------
1252   //---<  Used for optimization in Compile::shorten_branches  >---
1253   //--------------------------------------------------------------
1254 
1255  public:
1256   // Size of call trampoline stub.
1257   static uint size_call_trampoline() {
1258     return 0; // no call trampolines on this platform
1259   }
1260 
1261   // number of relocations needed by a call trampoline stub
1262   static uint reloc_call_trampoline() {
1263     return 0; // no call trampolines on this platform
1264   }
1265 };
1266 
1267 class HandlerImpl {
1268 
1269  public:
1270 
1271   static int emit_exception_handler(CodeBuffer &cbuf);
1272   static int emit_deopt_handler(CodeBuffer& cbuf);
1273 
1274   static uint size_exception_handler() {
1275     // NativeCall instruction size is the same as NativeJump.
    // exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1278     // Note that this value is also credited (in output.cpp) to
1279     // the size of the code section.
1280     return NativeJump::instruction_size;
1281   }
1282 
1283 #ifdef _LP64
1284   static uint size_deopt_handler() {
1285     // three 5 byte instructions plus one move for unreachable address.
1286     return 15+3;
1287   }
1288 #else
1289   static uint size_deopt_handler() {
1290     // NativeCall instruction size is the same as NativeJump.
    // exception handler starts out as a jump and can be patched to
    // a call by deoptimization.  (4932387)
1293     // Note that this value is also credited (in output.cpp) to
1294     // the size of the code section.
1295     return 5 + NativeJump::instruction_size; // pushl(); jmp;
1296   }
1297 #endif
1298 };
1299 
1300 %} // end source_hpp
1301 
1302 source %{
1303 
1304 #include "opto/addnode.hpp"
1305 
1306 // Emit exception handler code.
1307 // Stuff framesize into a register and call a VM stub routine.
1308 int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) {
1309 
1310   // Note that the code buffer's insts_mark is always relative to insts.
1311   // That's why we must use the macroassembler to generate a handler.
1312   MacroAssembler _masm(&cbuf);
1313   address base = __ start_a_stub(size_exception_handler());
1314   if (base == NULL) {
1315     ciEnv::current()->record_failure("CodeCache is full");
1316     return 0;  // CodeBuffer::expand failed
1317   }
1318   int offset = __ offset();
1319   __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
1320   assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
1321   __ end_a_stub();
1322   return offset;
1323 }
1324 
1325 // Emit deopt handler code.
1326 int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) {
1327 
1328   // Note that the code buffer's insts_mark is always relative to insts.
1329   // That's why we must use the macroassembler to generate a handler.
1330   MacroAssembler _masm(&cbuf);
1331   address base = __ start_a_stub(size_deopt_handler());
1332   if (base == NULL) {
1333     ciEnv::current()->record_failure("CodeCache is full");
1334     return 0;  // CodeBuffer::expand failed
1335   }
1336   int offset = __ offset();
1337 
1338 #ifdef _LP64
1339   address the_pc = (address) __ pc();
1340   Label next;
1341   // push a "the_pc" on the stack without destroying any registers
1342   // as they all may be live.
1343 
1344   // push address of "next"
1345   __ call(next, relocInfo::none); // reloc none is fine since it is a disp32
1346   __ bind(next);
1347   // adjust it so it matches "the_pc"
1348   __ subptr(Address(rsp, 0), __ offset() - offset);
1349 #else
1350   InternalAddress here(__ pc());
1351   __ pushptr(here.addr());
1352 #endif
1353 
1354   __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
1355   assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
1356   __ end_a_stub();
1357   return offset;
1358 }
1359 
1360 
1361 //=============================================================================
1362 
1363   // Float masks come from different places depending on platform.
1364 #ifdef _LP64
1365   static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
1366   static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
1367   static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
1368   static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
1369 #else
1370   static address float_signmask()  { return (address)float_signmask_pool; }
1371   static address float_signflip()  { return (address)float_signflip_pool; }
1372   static address double_signmask() { return (address)double_signmask_pool; }
1373   static address double_signflip() { return (address)double_signflip_pool; }
1374 #endif
1375   static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
1376   static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
1377   static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
1378 
1379 //=============================================================================
1380 const bool Matcher::match_rule_supported(int opcode) {
1381   if (!has_match_rule(opcode))
1382     return false;
1383 
1384   bool ret_value = true;
1385   switch (opcode) {
1386     case Op_AbsVL:
      if (UseAVX < 3)
        ret_value = false;
      break;
1389     case Op_PopCountI:
1390     case Op_PopCountL:
1391       if (!UsePopCountInstruction)
1392         ret_value = false;
1393       break;
1394     case Op_PopCountVI:
1395       if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
1396         ret_value = false;
1397       break;
1398     case Op_MulVI:
1399       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
1400         ret_value = false;
1401       break;
1402     case Op_MulVL:
1403     case Op_MulReductionVL:
1404       if (VM_Version::supports_avx512dq() == false)
1405         ret_value = false;
1406       break;
1407     case Op_AddReductionVL:
1408       if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
1409         ret_value = false;
1410       break;
1411     case Op_AbsVB:
1412     case Op_AbsVS:
1413     case Op_AbsVI:
1414     case Op_AddReductionVI:
1415       if (UseSSE < 3) // requires at least SSE3
1416         ret_value = false;
1417       break;
1418     case Op_MulReductionVI:
1419       if (UseSSE < 4) // requires at least SSE4
1420         ret_value = false;
1421       break;
1422     case Op_AddReductionVF:
1423     case Op_AddReductionVD:
1424     case Op_MulReductionVF:
1425     case Op_MulReductionVD:
1426       if (UseSSE < 1) // requires at least SSE
1427         ret_value = false;
1428       break;
1429     case Op_SqrtVD:
1430     case Op_SqrtVF:
1431       if (UseAVX < 1) // enabled for AVX only
1432         ret_value = false;
1433       break;
1434     case Op_CompareAndSwapL:
1435 #ifdef _LP64
1436     case Op_CompareAndSwapP:
1437 #endif
1438       if (!VM_Version::supports_cx8())
1439         ret_value = false;
1440       break;
1441     case Op_CMoveVF:
1442     case Op_CMoveVD:
1443       if (UseAVX < 1 || UseAVX > 2)
1444         ret_value = false;
1445       break;
1446     case Op_StrIndexOf:
1447       if (!UseSSE42Intrinsics)
1448         ret_value = false;
1449       break;
1450     case Op_StrIndexOfChar:
1451       if (!UseSSE42Intrinsics)
1452         ret_value = false;
1453       break;
1454     case Op_OnSpinWait:
1455       if (VM_Version::supports_on_spin_wait() == false)
1456         ret_value = false;
1457       break;
1458     case Op_MulAddVS2VI:
1459     case Op_RShiftVL:
1460     case Op_AbsVD:
1461     case Op_NegVD:
1462       if (UseSSE < 2)
1463         ret_value = false;
1464       break;
1465     case Op_MulVB:
1466     case Op_LShiftVB:
1467     case Op_RShiftVB:
1468     case Op_URShiftVB:
1469       if (UseSSE < 4)
1470         ret_value = false;
1471       break;
1472 #ifdef _LP64
1473     case Op_MaxD:
1474     case Op_MaxF:
1475     case Op_MinD:
1476     case Op_MinF:
1477       if (UseAVX < 1) // enabled for AVX only
1478         ret_value = false;
1479       break;
1480 #endif
1481   }
1482 
  return ret_value;  // By default, match rules are supported.
1484 }
1485 
1486 const bool Matcher::match_rule_supported_vector(int opcode, int vlen) {
1487   // identify extra cases that we might want to provide match rules for
1488   // e.g. Op_ vector nodes and other intrinsics while guarding with vlen
1489   bool ret_value = match_rule_supported(opcode);
1490   if (ret_value) {
1491     switch (opcode) {
1492       case Op_AbsVB:
1493       case Op_AddVB:
1494       case Op_SubVB:
1495         if ((vlen == 64) && (VM_Version::supports_avx512bw() == false))
1496           ret_value = false;
1497         break;
1498       case Op_AbsVS:
1499       case Op_AddVS:
1500       case Op_SubVS:
1501       case Op_MulVS:
1502       case Op_LShiftVS:
1503       case Op_RShiftVS:
1504       case Op_URShiftVS:
1505         if ((vlen == 32) && (VM_Version::supports_avx512bw() == false))
1506           ret_value = false;
1507         break;
1508       case Op_MulVB:
1509       case Op_LShiftVB:
1510       case Op_RShiftVB:
1511       case Op_URShiftVB:
1512         if ((vlen == 32 && UseAVX < 2) || 
1513             ((vlen == 64) && (VM_Version::supports_avx512bw() == false)))
1514           ret_value = false;
1515         break;
1516       case Op_NegVF:
1517         if ((vlen == 16) && (VM_Version::supports_avx512dq() == false))
1518           ret_value = false;
1519         break;
1520       case Op_CMoveVF:
1521         if (vlen != 8)
1522           ret_value  = false;
1523         break;
1524       case Op_NegVD:
1525         if ((vlen == 8) && (VM_Version::supports_avx512dq() == false))
1526           ret_value = false;
1527         break;
1528       case Op_CMoveVD:
1529         if (vlen != 4)
1530           ret_value  = false;
1531         break;
1532     }
1533   }
1534 
  return ret_value;  // By default, match rules are supported.
1536 }
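// Note: vlen above is the element count, so the guarded cases correspond to
// 512-bit vectors (e.g. 64 bytes or 32 shorts), which need AVX512BW for the
// byte/short operations listed.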
1537 
1538 const bool Matcher::has_predicated_vectors(void) {
1539   bool ret_value = false;
1540   if (UseAVX > 2) {
1541     ret_value = VM_Version::supports_avx512vl();
1542   }
1543 
1544   return ret_value;
1545 }
1546 
1547 const int Matcher::float_pressure(int default_pressure_threshold) {
1548   int float_pressure_threshold = default_pressure_threshold;
1549 #ifdef _LP64
1550   if (UseAVX > 2) {
1551     // Increase pressure threshold on machines with AVX3 which have
1552     // 2x more XMM registers.
1553     float_pressure_threshold = default_pressure_threshold * 2;
1554   }
1555 #endif
1556   return float_pressure_threshold;
1557 }
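// Illustrative example (assumed threshold value): if the incoming default
// threshold were 6, an AVX-512 machine would return 12 here, reflecting the
// doubled XMM register file (XMM0-XMM31).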
1558 
1559 // Max vector size in bytes. 0 if not supported.
1560 const int Matcher::vector_width_in_bytes(BasicType bt) {
1561   assert(is_java_primitive(bt), "only primitive type vectors");
1562   if (UseSSE < 2) return 0;
1563   // SSE2 supports 128bit vectors for all types.
1564   // AVX2 supports 256bit vectors for all types.
  // EVEX (AVX-512) supports 512bit vectors for all types.
1566   int size = (UseAVX > 1) ? (1 << UseAVX) * 8 : 16;
1567   // AVX1 supports 256bit vectors only for FLOAT and DOUBLE.
1568   if (UseAVX > 0 && (bt == T_FLOAT || bt == T_DOUBLE))
1569     size = (UseAVX > 2) ? 64 : 32;
1570   if (UseAVX > 2 && (bt == T_BYTE || bt == T_SHORT || bt == T_CHAR))
1571     size = (VM_Version::supports_avx512bw()) ? 64 : 32;
1572   // Use flag to limit vector size.
1573   size = MIN2(size,(int)MaxVectorSize);
1574   // Minimum 2 values in vector (or 4 for bytes).
1575   switch (bt) {
1576   case T_DOUBLE:
1577   case T_LONG:
1578     if (size < 16) return 0;
1579     break;
1580   case T_FLOAT:
1581   case T_INT:
1582     if (size < 8) return 0;
1583     break;
1584   case T_BOOLEAN:
1585     if (size < 4) return 0;
1586     break;
1587   case T_CHAR:
1588     if (size < 4) return 0;
1589     break;
1590   case T_BYTE:
1591     if (size < 4) return 0;
1592     break;
1593   case T_SHORT:
1594     if (size < 4) return 0;
1595     break;
1596   default:
1597     ShouldNotReachHere();
1598   }
1599   return size;
1600 }
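// Illustrative examples of the sizing above: with UseAVX == 2 and
// MaxVectorSize >= 32, a T_INT vector is 32 bytes; with UseAVX > 2, a T_BYTE
// vector is 64 bytes only when AVX512BW is available, otherwise 32 bytes.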
1601 
1602 // Limits on vector size (number of elements) loaded into vector.
1603 const int Matcher::max_vector_size(const BasicType bt) {
1604   return vector_width_in_bytes(bt)/type2aelembytes(bt);
1605 }
1606 const int Matcher::min_vector_size(const BasicType bt) {
1607   int max_size = max_vector_size(bt);
1608   // Min size which can be loaded into vector is 4 bytes.
1609   int size = (type2aelembytes(bt) == 1) ? 4 : 2;
1610   return MIN2(size,max_size);
1611 }
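// For example, with 32-byte T_INT vectors max_vector_size(T_INT) is 8
// elements, min_vector_size(T_INT) is 2 elements, and min_vector_size(T_BYTE)
// is 4 elements (the 4-byte minimum above).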
1612 
1613 // Vector ideal reg corresponding to specified size in bytes
1614 const uint Matcher::vector_ideal_reg(int size) {
1615   assert(MaxVectorSize >= size, "");
1616   switch(size) {
1617     case  4: return Op_VecS;
1618     case  8: return Op_VecD;
1619     case 16: return Op_VecX;
1620     case 32: return Op_VecY;
1621     case 64: return Op_VecZ;
1622   }
1623   ShouldNotReachHere();
1624   return 0;
1625 }
1626 
1627 // Only lowest bits of xmm reg are used for vector shift count.
1628 const uint Matcher::vector_shift_count_ideal_reg(int size) {
1629   return Op_VecS;
1630 }
1631 
1632 // x86 supports misaligned vectors store/load.
1633 const bool Matcher::misaligned_vectors_ok() {
1634   return true;
1635 }
1636 
1637 // x86 AES instructions are compatible with SunJCE expanded
1638 // keys, hence we do not need to pass the original key to stubs
1639 const bool Matcher::pass_original_key_for_aes() {
1640   return false;
1641 }
1642 
1643 
1644 const bool Matcher::convi2l_type_required = true;
1645 
1646 // Check for shift by small constant as well
1647 static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) {
1648   if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() &&
1649       shift->in(2)->get_int() <= 3 &&
1650       // Are there other uses besides address expressions?
1651       !matcher->is_visited(shift)) {
1652     address_visited.set(shift->_idx); // Flag as address_visited
1653     mstack.push(shift->in(2), Matcher::Visit);
1654     Node *conv = shift->in(1);
1655 #ifdef _LP64
1656     // Allow Matcher to match the rule which bypass
1657     // ConvI2L operation for an array index on LP64
1658     // if the index value is positive.
1659     if (conv->Opcode() == Op_ConvI2L &&
1660         conv->as_Type()->type()->is_long()->_lo >= 0 &&
1661         // Are there other uses besides address expressions?
1662         !matcher->is_visited(conv)) {
1663       address_visited.set(conv->_idx); // Flag as address_visited
1664       mstack.push(conv->in(1), Matcher::Pre_Visit);
1665     } else
1666 #endif
1667       mstack.push(conv, Matcher::Pre_Visit);
1668     return true;
1669   }
1670   return false;
1671 }
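// For example, an offset such as (index << 2) feeding an AddP can be folded
// into a scaled addressing mode like [base + index*4], so the shift is
// flagged as address-visited here instead of being computed into a register.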
1672 
1673 // Should the Matcher clone shifts on addressing modes, expecting them
1674 // to be subsumed into complex addressing expressions or compute them
1675 // into registers?
1676 bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) {
1677   Node *off = m->in(AddPNode::Offset);
1678   if (off->is_Con()) {
1679     address_visited.test_set(m->_idx); // Flag as address_visited
1680     Node *adr = m->in(AddPNode::Address);
1681 
1682     // Intel can handle 2 adds in addressing mode
1683     // AtomicAdd is not an addressing expression.
1684     // Cheap to find it by looking for screwy base.
1685     if (adr->is_AddP() &&
1686         !adr->in(AddPNode::Base)->is_top() &&
1687         LP64_ONLY( off->get_long() == (int) (off->get_long()) && ) // immL32
1688         // Are there other uses besides address expressions?
1689         !is_visited(adr)) {
1690       address_visited.set(adr->_idx); // Flag as address_visited
1691       Node *shift = adr->in(AddPNode::Offset);
1692       if (!clone_shift(shift, this, mstack, address_visited)) {
1693         mstack.push(shift, Pre_Visit);
1694       }
1695       mstack.push(adr->in(AddPNode::Address), Pre_Visit);
1696       mstack.push(adr->in(AddPNode::Base), Pre_Visit);
1697     } else {
1698       mstack.push(adr, Pre_Visit);
1699     }
1700 
1701     // Clone X+offset as it also folds into most addressing expressions
1702     mstack.push(off, Visit);
1703     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1704     return true;
1705   } else if (clone_shift(off, this, mstack, address_visited)) {
1706     address_visited.test_set(m->_idx); // Flag as address_visited
1707     mstack.push(m->in(AddPNode::Address), Pre_Visit);
1708     mstack.push(m->in(AddPNode::Base), Pre_Visit);
1709     return true;
1710   }
1711   return false;
1712 }
1713 
1714 void Compile::reshape_address(AddPNode* addp) {
1715 }
1716 
1717 // Helper methods for MachSpillCopyNode::implementation().
1718 static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
1719                           int src_hi, int dst_hi, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is complex, so the size is obtained
  // by emitting the instructions into a scratch buffer.
1722   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1723   assert(ireg == Op_VecS || // 32bit vector
1724          (src_lo & 1) == 0 && (src_lo + 1) == src_hi &&
1725          (dst_lo & 1) == 0 && (dst_lo + 1) == dst_hi,
1726          "no non-adjacent vector moves" );
1727   if (cbuf) {
1728     MacroAssembler _masm(cbuf);
1729     int offset = __ offset();
1730     switch (ireg) {
1731     case Op_VecS: // copy whole register
1732     case Op_VecD:
1733     case Op_VecX:
1734 #ifndef _LP64
1735       __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1736 #else
1737       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1738         __ movdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1739       } else {
1740         __ vextractf32x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
1742 #endif
1743       break;
1744     case Op_VecY:
1745 #ifndef _LP64
1746       __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1747 #else
1748       if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1749         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]));
1750       } else {
1751         __ vextractf64x4(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 0x0);
      }
1753 #endif
1754       break;
1755     case Op_VecZ:
1756       __ evmovdquq(as_XMMRegister(Matcher::_regEncode[dst_lo]), as_XMMRegister(Matcher::_regEncode[src_lo]), 2);
1757       break;
1758     default:
1759       ShouldNotReachHere();
1760     }
1761     int size = __ offset() - offset;
1762 #ifdef ASSERT
1763     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == 4, "incorrect size calculation");
1765 #endif
1766     return size;
1767 #ifndef PRODUCT
1768   } else if (!do_size) {
1769     switch (ireg) {
1770     case Op_VecS:
1771     case Op_VecD:
1772     case Op_VecX:
1773       st->print("movdqu  %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1774       break;
1775     case Op_VecY:
1776     case Op_VecZ:
1777       st->print("vmovdqu %s,%s\t# spill",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
1778       break;
1779     default:
1780       ShouldNotReachHere();
1781     }
1782 #endif
1783   }
1784   // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
1785   return (UseAVX > 2) ? 6 : 4;
1786 }
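// Note on the EVEX paths above: without AVX512VL, 128/256-bit moves cannot
// encode XMM16-XMM31, which is presumably why vextractf32x4/vextractf64x4
// (full-width AVX-512 instructions) are used instead of (v)movdqu when
// UseAVX > 2 and AVX512VL is absent.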
1787 
1788 static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
1789                             int stack_offset, int reg, uint ireg, outputStream* st) {
  // In the 64-bit VM, size calculation is complex, so the size is obtained
  // by emitting the instructions into a scratch buffer.
1792   LP64_ONLY( assert(!do_size, "this method calculates size only for 32-bit VM"); )
1793   if (cbuf) {
1794     MacroAssembler _masm(cbuf);
1795     int offset = __ offset();
1796     if (is_load) {
1797       switch (ireg) {
1798       case Op_VecS:
1799         __ movdl(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1800         break;
1801       case Op_VecD:
1802         __ movq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1803         break;
1804       case Op_VecX:
1805 #ifndef _LP64
1806         __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1807 #else
1808         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1809           __ movdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1810         } else {
1811           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1812           __ vinsertf32x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1813         }
1814 #endif
1815         break;
1816       case Op_VecY:
1817 #ifndef _LP64
1818         __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1819 #else
1820         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1821           __ vmovdqu(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset));
1822         } else {
1823           __ vpxor(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1824           __ vinsertf64x4(as_XMMRegister(Matcher::_regEncode[reg]), as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset),0x0);
1825         }
1826 #endif
1827         break;
1828       case Op_VecZ:
1829         __ evmovdquq(as_XMMRegister(Matcher::_regEncode[reg]), Address(rsp, stack_offset), 2);
1830         break;
1831       default:
1832         ShouldNotReachHere();
1833       }
1834     } else { // store
1835       switch (ireg) {
1836       case Op_VecS:
1837         __ movdl(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1838         break;
1839       case Op_VecD:
1840         __ movq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1841         break;
1842       case Op_VecX:
1843 #ifndef _LP64
1844         __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1845 #else
1846         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1847           __ movdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
        } else {
1850           __ vextractf32x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1851         }
1852 #endif
1853         break;
1854       case Op_VecY:
1855 #ifndef _LP64
1856         __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
1857 #else
1858         if ((UseAVX < 3) || VM_Version::supports_avx512vl()) {
1859           __ vmovdqu(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]));
        } else {
1862           __ vextractf64x4(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 0x0);
1863         }
1864 #endif
1865         break;
1866       case Op_VecZ:
1867         __ evmovdquq(Address(rsp, stack_offset), as_XMMRegister(Matcher::_regEncode[reg]), 2);
1868         break;
1869       default:
1870         ShouldNotReachHere();
1871       }
1872     }
1873     int size = __ offset() - offset;
1874 #ifdef ASSERT
1875     int offset_size = (stack_offset == 0) ? 0 : ((stack_offset < 0x80) ? 1 : (UseAVX > 2) ? 6 : 4);
1876     // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
    assert(!do_size || size == (5+offset_size), "incorrect size calculation");
1878 #endif
1879     return size;
1880 #ifndef PRODUCT
1881   } else if (!do_size) {
1882     if (is_load) {
1883       switch (ireg) {
1884       case Op_VecS:
1885         st->print("movd    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1886         break;
1887       case Op_VecD:
1888         st->print("movq    %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1889         break;
1890        case Op_VecX:
1891         st->print("movdqu  %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1892         break;
1893       case Op_VecY:
1894       case Op_VecZ:
1895         st->print("vmovdqu %s,[rsp + %d]\t# spill", Matcher::regName[reg], stack_offset);
1896         break;
1897       default:
1898         ShouldNotReachHere();
1899       }
1900     } else { // store
1901       switch (ireg) {
1902       case Op_VecS:
1903         st->print("movd    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1904         break;
1905       case Op_VecD:
1906         st->print("movq    [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1907         break;
1908        case Op_VecX:
1909         st->print("movdqu  [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1910         break;
1911       case Op_VecY:
1912       case Op_VecZ:
1913         st->print("vmovdqu [rsp + %d],%s\t# spill", stack_offset, Matcher::regName[reg]);
1914         break;
1915       default:
1916         ShouldNotReachHere();
1917       }
1918     }
1919 #endif
1920   }
1921   bool is_single_byte = false;
1922   int vec_len = 0;
1923   if ((UseAVX > 2) && (stack_offset != 0)) {
1924     int tuple_type = Assembler::EVEX_FVM;
1925     int input_size = Assembler::EVEX_32bit;
1926     switch (ireg) {
1927     case Op_VecS:
1928       tuple_type = Assembler::EVEX_T1S;
1929       break;
1930     case Op_VecD:
1931       tuple_type = Assembler::EVEX_T1S;
1932       input_size = Assembler::EVEX_64bit;
1933       break;
1934     case Op_VecX:
1935       break;
1936     case Op_VecY:
1937       vec_len = 1;
1938       break;
1939     case Op_VecZ:
1940       vec_len = 2;
1941       break;
1942     }
1943     is_single_byte = Assembler::query_compressed_disp_byte(stack_offset, true, vec_len, tuple_type, input_size, 0);
1944   }
1945   int offset_size = 0;
1946   int size = 5;
1947   if (UseAVX > 2 ) {
1948     if (VM_Version::supports_avx512novl() && (vec_len == 2)) {
1949       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
1950       size += 2; // Need an additional two bytes for EVEX encoding
1951     } else if (VM_Version::supports_avx512novl() && (vec_len < 2)) {
1952       offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1953     } else {
1954       offset_size = (stack_offset == 0) ? 0 : ((is_single_byte) ? 1 : 4);
      size += 2; // Need an additional two bytes for EVEX encoding
1956     }
1957   } else {
1958     offset_size = (stack_offset == 0) ? 0 : ((stack_offset <= 127) ? 1 : 4);
1959   }
1960   // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
1961   return size+offset_size;
1962 }
1963 
1964 static inline jint replicate4_imm(int con, int width) {
1965   // Load a constant of "width" (in bytes) and replicate it to fill 32bit.
1966   assert(width == 1 || width == 2, "only byte or short types here");
1967   int bit_width = width * 8;
1968   jint val = con;
1969   val &= (1 << bit_width) - 1;  // mask off sign bits
1970   while(bit_width < 32) {
1971     val |= (val << bit_width);
1972     bit_width <<= 1;
1973   }
1974   return val;
1975 }
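// For example, replicate4_imm(0x12, 1) yields 0x12121212 and
// replicate4_imm(0x1234, 2) yields 0x12341234.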
1976 
1977 static inline jlong replicate8_imm(int con, int width) {
1978   // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
1979   assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
1980   int bit_width = width * 8;
1981   jlong val = con;
1982   val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
1983   while(bit_width < 64) {
1984     val |= (val << bit_width);
1985     bit_width <<= 1;
1986   }
1987   return val;
1988 }
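// For example, replicate8_imm(0xAB, 1) yields 0xABABABABABABABAB and
// replicate8_imm(0x12345678, 4) yields 0x1234567812345678.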
1989 
1990 #ifndef PRODUCT
1991   void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
1992     st->print("nop \t# %d bytes pad for loops and calls", _count);
1993   }
1994 #endif
1995 
1996   void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
1997     MacroAssembler _masm(&cbuf);
1998     __ nop(_count);
1999   }
2000 
2001   uint MachNopNode::size(PhaseRegAlloc*) const {
2002     return _count;
2003   }
2004 
2005 #ifndef PRODUCT
2006   void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
2007     st->print("# breakpoint");
2008   }
2009 #endif
2010 
2011   void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc* ra_) const {
2012     MacroAssembler _masm(&cbuf);
2013     __ int3();
2014   }
2015 
2016   uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
2017     return MachNode::size(ra_);
2018   }
2019 
2020 %}
2021 
2022 encode %{
2023 
2024   enc_class call_epilog %{
2025     MacroAssembler _masm(&cbuf);
2026     if (VerifyStackAtCalls) {
2027       // Check that stack depth is unchanged: find majik cookie on stack
2028       int framesize = ra_->reg2offset_unchecked(OptoReg::add(ra_->_matcher._old_SP, -3*VMRegImpl::slots_per_word));
2029       Label L;
2030       __ cmpptr(Address(rsp, framesize), (int32_t)0xbadb100d);
2031       __ jccb(Assembler::equal, L);
2032       // Die if stack mismatch
2033       __ int3();
2034       __ bind(L);
2035     }
2036     __ oopmap_metadata(-1);
2037   %}
2038 
2039 %}
2040 
2041 
2042 //----------OPERANDS-----------------------------------------------------------
2043 // Operand definitions must precede instruction definitions for correct parsing
2044 // in the ADLC because operands constitute user defined types which are used in
2045 // instruction definitions.
2046 
2047 operand vecZ() %{
2048   constraint(ALLOC_IN_RC(vectorz_reg));
2049   match(VecZ);
2050 
2051   format %{ %}
2052   interface(REG_INTER);
2053 %}
2054 
2055 operand legVecZ() %{
2056   constraint(ALLOC_IN_RC(vectorz_reg_vl));
2057   match(VecZ);
2058 
2059   format %{ %}
2060   interface(REG_INTER);
2061 %}
2062 
2063 // Comparison Code for FP conditional move
2064 operand cmpOp_vcmppd() %{
2065   match(Bool);
2066 
2067   predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
2068             n->as_Bool()->_test._test != BoolTest::no_overflow);
2069   format %{ "" %}
2070   interface(COND_INTER) %{
2071     equal        (0x0, "eq");
2072     less         (0x1, "lt");
2073     less_equal   (0x2, "le");
2074     not_equal    (0xC, "ne");
2075     greater_equal(0xD, "ge");
2076     greater      (0xE, "gt");
2077     //TODO cannot compile (adlc breaks) without two next lines with error:
2078     // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
2079     // equal' for overflow.
2080     overflow     (0x20, "o");  // not really supported by the instruction
2081     no_overflow  (0x21, "no"); // not really supported by the instruction
2082   %}
2083 %}
2084 
2085 
2086 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
2087 
2088 // ============================================================================
2089 
2090 instruct ShouldNotReachHere() %{
2091   match(Halt);
2092   format %{ "ud2\t# ShouldNotReachHere" %}
2093   ins_encode %{
2094     __ ud2();
2095   %}
2096   ins_pipe(pipe_slow);
2097 %}
2098 
2099 // =================================EVEX special===============================
2100 
2101 instruct setMask(rRegI dst, rRegI src) %{
2102   predicate(Matcher::has_predicated_vectors());
2103   match(Set dst (SetVectMaskI  src));
2104   effect(TEMP dst);
2105   format %{ "setvectmask   $dst, $src" %}
2106   ins_encode %{
2107     __ setvectmask($dst$$Register, $src$$Register);
2108   %}
2109   ins_pipe(pipe_slow);
2110 %}
2111 
2112 // ============================================================================
2113 
2114 instruct addF_reg(regF dst, regF src) %{
2115   predicate((UseSSE>=1) && (UseAVX == 0));
2116   match(Set dst (AddF dst src));
2117 
2118   format %{ "addss   $dst, $src" %}
2119   ins_cost(150);
2120   ins_encode %{
2121     __ addss($dst$$XMMRegister, $src$$XMMRegister);
2122   %}
2123   ins_pipe(pipe_slow);
2124 %}
2125 
2126 instruct addF_mem(regF dst, memory src) %{
2127   predicate((UseSSE>=1) && (UseAVX == 0));
2128   match(Set dst (AddF dst (LoadF src)));
2129 
2130   format %{ "addss   $dst, $src" %}
2131   ins_cost(150);
2132   ins_encode %{
2133     __ addss($dst$$XMMRegister, $src$$Address);
2134   %}
2135   ins_pipe(pipe_slow);
2136 %}
2137 
2138 instruct addF_imm(regF dst, immF con) %{
2139   predicate((UseSSE>=1) && (UseAVX == 0));
2140   match(Set dst (AddF dst con));
2141   format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2142   ins_cost(150);
2143   ins_encode %{
2144     __ addss($dst$$XMMRegister, $constantaddress($con));
2145   %}
2146   ins_pipe(pipe_slow);
2147 %}
2148 
2149 instruct addF_reg_reg(regF dst, regF src1, regF src2) %{
2150   predicate(UseAVX > 0);
2151   match(Set dst (AddF src1 src2));
2152 
2153   format %{ "vaddss  $dst, $src1, $src2" %}
2154   ins_cost(150);
2155   ins_encode %{
2156     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2157   %}
2158   ins_pipe(pipe_slow);
2159 %}
2160 
2161 instruct addF_reg_mem(regF dst, regF src1, memory src2) %{
2162   predicate(UseAVX > 0);
2163   match(Set dst (AddF src1 (LoadF src2)));
2164 
2165   format %{ "vaddss  $dst, $src1, $src2" %}
2166   ins_cost(150);
2167   ins_encode %{
2168     __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2169   %}
2170   ins_pipe(pipe_slow);
2171 %}
2172 
2173 instruct addF_reg_imm(regF dst, regF src, immF con) %{
2174   predicate(UseAVX > 0);
2175   match(Set dst (AddF src con));
2176 
2177   format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2178   ins_cost(150);
2179   ins_encode %{
2180     __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2181   %}
2182   ins_pipe(pipe_slow);
2183 %}
2184 
2185 instruct addD_reg(regD dst, regD src) %{
2186   predicate((UseSSE>=2) && (UseAVX == 0));
2187   match(Set dst (AddD dst src));
2188 
2189   format %{ "addsd   $dst, $src" %}
2190   ins_cost(150);
2191   ins_encode %{
2192     __ addsd($dst$$XMMRegister, $src$$XMMRegister);
2193   %}
2194   ins_pipe(pipe_slow);
2195 %}
2196 
2197 instruct addD_mem(regD dst, memory src) %{
2198   predicate((UseSSE>=2) && (UseAVX == 0));
2199   match(Set dst (AddD dst (LoadD src)));
2200 
2201   format %{ "addsd   $dst, $src" %}
2202   ins_cost(150);
2203   ins_encode %{
2204     __ addsd($dst$$XMMRegister, $src$$Address);
2205   %}
2206   ins_pipe(pipe_slow);
2207 %}
2208 
2209 instruct addD_imm(regD dst, immD con) %{
2210   predicate((UseSSE>=2) && (UseAVX == 0));
2211   match(Set dst (AddD dst con));
2212   format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2213   ins_cost(150);
2214   ins_encode %{
2215     __ addsd($dst$$XMMRegister, $constantaddress($con));
2216   %}
2217   ins_pipe(pipe_slow);
2218 %}
2219 
2220 instruct addD_reg_reg(regD dst, regD src1, regD src2) %{
2221   predicate(UseAVX > 0);
2222   match(Set dst (AddD src1 src2));
2223 
2224   format %{ "vaddsd  $dst, $src1, $src2" %}
2225   ins_cost(150);
2226   ins_encode %{
2227     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2228   %}
2229   ins_pipe(pipe_slow);
2230 %}
2231 
2232 instruct addD_reg_mem(regD dst, regD src1, memory src2) %{
2233   predicate(UseAVX > 0);
2234   match(Set dst (AddD src1 (LoadD src2)));
2235 
2236   format %{ "vaddsd  $dst, $src1, $src2" %}
2237   ins_cost(150);
2238   ins_encode %{
2239     __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2240   %}
2241   ins_pipe(pipe_slow);
2242 %}
2243 
2244 instruct addD_reg_imm(regD dst, regD src, immD con) %{
2245   predicate(UseAVX > 0);
2246   match(Set dst (AddD src con));
2247 
2248   format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2249   ins_cost(150);
2250   ins_encode %{
2251     __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2252   %}
2253   ins_pipe(pipe_slow);
2254 %}
2255 
2256 instruct subF_reg(regF dst, regF src) %{
2257   predicate((UseSSE>=1) && (UseAVX == 0));
2258   match(Set dst (SubF dst src));
2259 
2260   format %{ "subss   $dst, $src" %}
2261   ins_cost(150);
2262   ins_encode %{
2263     __ subss($dst$$XMMRegister, $src$$XMMRegister);
2264   %}
2265   ins_pipe(pipe_slow);
2266 %}
2267 
2268 instruct subF_mem(regF dst, memory src) %{
2269   predicate((UseSSE>=1) && (UseAVX == 0));
2270   match(Set dst (SubF dst (LoadF src)));
2271 
2272   format %{ "subss   $dst, $src" %}
2273   ins_cost(150);
2274   ins_encode %{
2275     __ subss($dst$$XMMRegister, $src$$Address);
2276   %}
2277   ins_pipe(pipe_slow);
2278 %}
2279 
2280 instruct subF_imm(regF dst, immF con) %{
2281   predicate((UseSSE>=1) && (UseAVX == 0));
2282   match(Set dst (SubF dst con));
2283   format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2284   ins_cost(150);
2285   ins_encode %{
2286     __ subss($dst$$XMMRegister, $constantaddress($con));
2287   %}
2288   ins_pipe(pipe_slow);
2289 %}
2290 
2291 instruct subF_reg_reg(regF dst, regF src1, regF src2) %{
2292   predicate(UseAVX > 0);
2293   match(Set dst (SubF src1 src2));
2294 
2295   format %{ "vsubss  $dst, $src1, $src2" %}
2296   ins_cost(150);
2297   ins_encode %{
2298     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2299   %}
2300   ins_pipe(pipe_slow);
2301 %}
2302 
2303 instruct subF_reg_mem(regF dst, regF src1, memory src2) %{
2304   predicate(UseAVX > 0);
2305   match(Set dst (SubF src1 (LoadF src2)));
2306 
2307   format %{ "vsubss  $dst, $src1, $src2" %}
2308   ins_cost(150);
2309   ins_encode %{
2310     __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2311   %}
2312   ins_pipe(pipe_slow);
2313 %}
2314 
2315 instruct subF_reg_imm(regF dst, regF src, immF con) %{
2316   predicate(UseAVX > 0);
2317   match(Set dst (SubF src con));
2318 
2319   format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2320   ins_cost(150);
2321   ins_encode %{
2322     __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2323   %}
2324   ins_pipe(pipe_slow);
2325 %}
2326 
2327 instruct subD_reg(regD dst, regD src) %{
2328   predicate((UseSSE>=2) && (UseAVX == 0));
2329   match(Set dst (SubD dst src));
2330 
2331   format %{ "subsd   $dst, $src" %}
2332   ins_cost(150);
2333   ins_encode %{
2334     __ subsd($dst$$XMMRegister, $src$$XMMRegister);
2335   %}
2336   ins_pipe(pipe_slow);
2337 %}
2338 
2339 instruct subD_mem(regD dst, memory src) %{
2340   predicate((UseSSE>=2) && (UseAVX == 0));
2341   match(Set dst (SubD dst (LoadD src)));
2342 
2343   format %{ "subsd   $dst, $src" %}
2344   ins_cost(150);
2345   ins_encode %{
2346     __ subsd($dst$$XMMRegister, $src$$Address);
2347   %}
2348   ins_pipe(pipe_slow);
2349 %}
2350 
2351 instruct subD_imm(regD dst, immD con) %{
2352   predicate((UseSSE>=2) && (UseAVX == 0));
2353   match(Set dst (SubD dst con));
2354   format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2355   ins_cost(150);
2356   ins_encode %{
2357     __ subsd($dst$$XMMRegister, $constantaddress($con));
2358   %}
2359   ins_pipe(pipe_slow);
2360 %}
2361 
2362 instruct subD_reg_reg(regD dst, regD src1, regD src2) %{
2363   predicate(UseAVX > 0);
2364   match(Set dst (SubD src1 src2));
2365 
2366   format %{ "vsubsd  $dst, $src1, $src2" %}
2367   ins_cost(150);
2368   ins_encode %{
2369     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2370   %}
2371   ins_pipe(pipe_slow);
2372 %}
2373 
2374 instruct subD_reg_mem(regD dst, regD src1, memory src2) %{
2375   predicate(UseAVX > 0);
2376   match(Set dst (SubD src1 (LoadD src2)));
2377 
2378   format %{ "vsubsd  $dst, $src1, $src2" %}
2379   ins_cost(150);
2380   ins_encode %{
2381     __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2382   %}
2383   ins_pipe(pipe_slow);
2384 %}
2385 
2386 instruct subD_reg_imm(regD dst, regD src, immD con) %{
2387   predicate(UseAVX > 0);
2388   match(Set dst (SubD src con));
2389 
2390   format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2391   ins_cost(150);
2392   ins_encode %{
2393     __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2394   %}
2395   ins_pipe(pipe_slow);
2396 %}
2397 
2398 instruct mulF_reg(regF dst, regF src) %{
2399   predicate((UseSSE>=1) && (UseAVX == 0));
2400   match(Set dst (MulF dst src));
2401 
2402   format %{ "mulss   $dst, $src" %}
2403   ins_cost(150);
2404   ins_encode %{
2405     __ mulss($dst$$XMMRegister, $src$$XMMRegister);
2406   %}
2407   ins_pipe(pipe_slow);
2408 %}
2409 
2410 instruct mulF_mem(regF dst, memory src) %{
2411   predicate((UseSSE>=1) && (UseAVX == 0));
2412   match(Set dst (MulF dst (LoadF src)));
2413 
2414   format %{ "mulss   $dst, $src" %}
2415   ins_cost(150);
2416   ins_encode %{
2417     __ mulss($dst$$XMMRegister, $src$$Address);
2418   %}
2419   ins_pipe(pipe_slow);
2420 %}
2421 
2422 instruct mulF_imm(regF dst, immF con) %{
2423   predicate((UseSSE>=1) && (UseAVX == 0));
2424   match(Set dst (MulF dst con));
2425   format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2426   ins_cost(150);
2427   ins_encode %{
2428     __ mulss($dst$$XMMRegister, $constantaddress($con));
2429   %}
2430   ins_pipe(pipe_slow);
2431 %}
2432 
2433 instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{
2434   predicate(UseAVX > 0);
2435   match(Set dst (MulF src1 src2));
2436 
2437   format %{ "vmulss  $dst, $src1, $src2" %}
2438   ins_cost(150);
2439   ins_encode %{
2440     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2441   %}
2442   ins_pipe(pipe_slow);
2443 %}
2444 
2445 instruct mulF_reg_mem(regF dst, regF src1, memory src2) %{
2446   predicate(UseAVX > 0);
2447   match(Set dst (MulF src1 (LoadF src2)));
2448 
2449   format %{ "vmulss  $dst, $src1, $src2" %}
2450   ins_cost(150);
2451   ins_encode %{
2452     __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2453   %}
2454   ins_pipe(pipe_slow);
2455 %}
2456 
2457 instruct mulF_reg_imm(regF dst, regF src, immF con) %{
2458   predicate(UseAVX > 0);
2459   match(Set dst (MulF src con));
2460 
2461   format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2462   ins_cost(150);
2463   ins_encode %{
2464     __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2465   %}
2466   ins_pipe(pipe_slow);
2467 %}
2468 
2469 instruct mulD_reg(regD dst, regD src) %{
2470   predicate((UseSSE>=2) && (UseAVX == 0));
2471   match(Set dst (MulD dst src));
2472 
2473   format %{ "mulsd   $dst, $src" %}
2474   ins_cost(150);
2475   ins_encode %{
2476     __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
2477   %}
2478   ins_pipe(pipe_slow);
2479 %}
2480 
2481 instruct mulD_mem(regD dst, memory src) %{
2482   predicate((UseSSE>=2) && (UseAVX == 0));
2483   match(Set dst (MulD dst (LoadD src)));
2484 
2485   format %{ "mulsd   $dst, $src" %}
2486   ins_cost(150);
2487   ins_encode %{
2488     __ mulsd($dst$$XMMRegister, $src$$Address);
2489   %}
2490   ins_pipe(pipe_slow);
2491 %}
2492 
2493 instruct mulD_imm(regD dst, immD con) %{
2494   predicate((UseSSE>=2) && (UseAVX == 0));
2495   match(Set dst (MulD dst con));
2496   format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2497   ins_cost(150);
2498   ins_encode %{
2499     __ mulsd($dst$$XMMRegister, $constantaddress($con));
2500   %}
2501   ins_pipe(pipe_slow);
2502 %}
2503 
2504 instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{
2505   predicate(UseAVX > 0);
2506   match(Set dst (MulD src1 src2));
2507 
2508   format %{ "vmulsd  $dst, $src1, $src2" %}
2509   ins_cost(150);
2510   ins_encode %{
2511     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2512   %}
2513   ins_pipe(pipe_slow);
2514 %}
2515 
2516 instruct mulD_reg_mem(regD dst, regD src1, memory src2) %{
2517   predicate(UseAVX > 0);
2518   match(Set dst (MulD src1 (LoadD src2)));
2519 
2520   format %{ "vmulsd  $dst, $src1, $src2" %}
2521   ins_cost(150);
2522   ins_encode %{
2523     __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2524   %}
2525   ins_pipe(pipe_slow);
2526 %}
2527 
2528 instruct mulD_reg_imm(regD dst, regD src, immD con) %{
2529   predicate(UseAVX > 0);
2530   match(Set dst (MulD src con));
2531 
2532   format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2533   ins_cost(150);
2534   ins_encode %{
2535     __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2536   %}
2537   ins_pipe(pipe_slow);
2538 %}
2539 
2540 instruct divF_reg(regF dst, regF src) %{
2541   predicate((UseSSE>=1) && (UseAVX == 0));
2542   match(Set dst (DivF dst src));
2543 
2544   format %{ "divss   $dst, $src" %}
2545   ins_cost(150);
2546   ins_encode %{
2547     __ divss($dst$$XMMRegister, $src$$XMMRegister);
2548   %}
2549   ins_pipe(pipe_slow);
2550 %}
2551 
2552 instruct divF_mem(regF dst, memory src) %{
2553   predicate((UseSSE>=1) && (UseAVX == 0));
2554   match(Set dst (DivF dst (LoadF src)));
2555 
2556   format %{ "divss   $dst, $src" %}
2557   ins_cost(150);
2558   ins_encode %{
2559     __ divss($dst$$XMMRegister, $src$$Address);
2560   %}
2561   ins_pipe(pipe_slow);
2562 %}
2563 
2564 instruct divF_imm(regF dst, immF con) %{
2565   predicate((UseSSE>=1) && (UseAVX == 0));
2566   match(Set dst (DivF dst con));
2567   format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2568   ins_cost(150);
2569   ins_encode %{
2570     __ divss($dst$$XMMRegister, $constantaddress($con));
2571   %}
2572   ins_pipe(pipe_slow);
2573 %}
2574 
2575 instruct divF_reg_reg(regF dst, regF src1, regF src2) %{
2576   predicate(UseAVX > 0);
2577   match(Set dst (DivF src1 src2));
2578 
2579   format %{ "vdivss  $dst, $src1, $src2" %}
2580   ins_cost(150);
2581   ins_encode %{
2582     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2583   %}
2584   ins_pipe(pipe_slow);
2585 %}
2586 
2587 instruct divF_reg_mem(regF dst, regF src1, memory src2) %{
2588   predicate(UseAVX > 0);
2589   match(Set dst (DivF src1 (LoadF src2)));
2590 
2591   format %{ "vdivss  $dst, $src1, $src2" %}
2592   ins_cost(150);
2593   ins_encode %{
2594     __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2595   %}
2596   ins_pipe(pipe_slow);
2597 %}
2598 
2599 instruct divF_reg_imm(regF dst, regF src, immF con) %{
2600   predicate(UseAVX > 0);
2601   match(Set dst (DivF src con));
2602 
2603   format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
2604   ins_cost(150);
2605   ins_encode %{
2606     __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2607   %}
2608   ins_pipe(pipe_slow);
2609 %}
2610 
2611 instruct divD_reg(regD dst, regD src) %{
2612   predicate((UseSSE>=2) && (UseAVX == 0));
2613   match(Set dst (DivD dst src));
2614 
2615   format %{ "divsd   $dst, $src" %}
2616   ins_cost(150);
2617   ins_encode %{
2618     __ divsd($dst$$XMMRegister, $src$$XMMRegister);
2619   %}
2620   ins_pipe(pipe_slow);
2621 %}
2622 
2623 instruct divD_mem(regD dst, memory src) %{
2624   predicate((UseSSE>=2) && (UseAVX == 0));
2625   match(Set dst (DivD dst (LoadD src)));
2626 
2627   format %{ "divsd   $dst, $src" %}
2628   ins_cost(150);
2629   ins_encode %{
2630     __ divsd($dst$$XMMRegister, $src$$Address);
2631   %}
2632   ins_pipe(pipe_slow);
2633 %}
2634 
2635 instruct divD_imm(regD dst, immD con) %{
2636   predicate((UseSSE>=2) && (UseAVX == 0));
2637   match(Set dst (DivD dst con));
2638   format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2639   ins_cost(150);
2640   ins_encode %{
2641     __ divsd($dst$$XMMRegister, $constantaddress($con));
2642   %}
2643   ins_pipe(pipe_slow);
2644 %}
2645 
2646 instruct divD_reg_reg(regD dst, regD src1, regD src2) %{
2647   predicate(UseAVX > 0);
2648   match(Set dst (DivD src1 src2));
2649 
2650   format %{ "vdivsd  $dst, $src1, $src2" %}
2651   ins_cost(150);
2652   ins_encode %{
2653     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
2654   %}
2655   ins_pipe(pipe_slow);
2656 %}
2657 
2658 instruct divD_reg_mem(regD dst, regD src1, memory src2) %{
2659   predicate(UseAVX > 0);
2660   match(Set dst (DivD src1 (LoadD src2)));
2661 
2662   format %{ "vdivsd  $dst, $src1, $src2" %}
2663   ins_cost(150);
2664   ins_encode %{
2665     __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
2666   %}
2667   ins_pipe(pipe_slow);
2668 %}
2669 
2670 instruct divD_reg_imm(regD dst, regD src, immD con) %{
2671   predicate(UseAVX > 0);
2672   match(Set dst (DivD src con));
2673 
2674   format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
2675   ins_cost(150);
2676   ins_encode %{
2677     __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
2678   %}
2679   ins_pipe(pipe_slow);
2680 %}
2681 
2682 instruct absF_reg(regF dst) %{
2683   predicate((UseSSE>=1) && (UseAVX == 0));
2684   match(Set dst (AbsF dst));
2685   ins_cost(150);
2686   format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
2687   ins_encode %{
2688     __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
2689   %}
2690   ins_pipe(pipe_slow);
2691 %}
2692 
2693 instruct absF_reg_reg(vlRegF dst, vlRegF src) %{
2694   predicate(UseAVX > 0);
2695   match(Set dst (AbsF src));
2696   ins_cost(150);
2697   format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
2698   ins_encode %{
2699     int vector_len = 0;
2700     __ vandps($dst$$XMMRegister, $src$$XMMRegister,
2701               ExternalAddress(float_signmask()), vector_len);
2702   %}
2703   ins_pipe(pipe_slow);
2704 %}
2705 
2706 instruct absD_reg(regD dst) %{
2707   predicate((UseSSE>=2) && (UseAVX == 0));
2708   match(Set dst (AbsD dst));
2709   ins_cost(150);
2710   format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
2711             "# abs double by sign masking" %}
2712   ins_encode %{
2713     __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
2714   %}
2715   ins_pipe(pipe_slow);
2716 %}
2717 
2718 instruct absD_reg_reg(vlRegD dst, vlRegD src) %{
2719   predicate(UseAVX > 0);
2720   match(Set dst (AbsD src));
2721   ins_cost(150);
2722   format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
2723             "# abs double by sign masking" %}
2724   ins_encode %{
2725     int vector_len = 0;
2726     __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
2727               ExternalAddress(double_signmask()), vector_len);
2728   %}
2729   ins_pipe(pipe_slow);
2730 %}
2731 
2732 instruct negF_reg(regF dst) %{
2733   predicate((UseSSE>=1) && (UseAVX == 0));
2734   match(Set dst (NegF dst));
2735   ins_cost(150);
2736   format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
2737   ins_encode %{
2738     __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
2739   %}
2740   ins_pipe(pipe_slow);
2741 %}
2742 
2743 instruct negF_reg_reg(vlRegF dst, vlRegF src) %{
2744   predicate(UseAVX > 0);
2745   match(Set dst (NegF src));
2746   ins_cost(150);
2747   format %{ "vnegatess  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
2748   ins_encode %{
2749     __ vnegatess($dst$$XMMRegister, $src$$XMMRegister,
2750                  ExternalAddress(float_signflip()));
2751   %}
2752   ins_pipe(pipe_slow);
2753 %}
2754 
2755 instruct negD_reg(regD dst) %{
2756   predicate((UseSSE>=2) && (UseAVX == 0));
2757   match(Set dst (NegD dst));
2758   ins_cost(150);
2759   format %{ "xorpd   $dst, [0x8000000000000000]\t"
2760             "# neg double by sign flipping" %}
2761   ins_encode %{
2762     __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
2763   %}
2764   ins_pipe(pipe_slow);
2765 %}
2766 
2767 instruct negD_reg_reg(vlRegD dst, vlRegD src) %{
2768   predicate(UseAVX > 0);
2769   match(Set dst (NegD src));
2770   ins_cost(150);
2771   format %{ "vnegatesd  $dst, $src, [0x8000000000000000]\t"
2772             "# neg double by sign flipping" %}
2773   ins_encode %{
2774     __ vnegatesd($dst$$XMMRegister, $src$$XMMRegister,
2775                  ExternalAddress(double_signflip()));
2776   %}
2777   ins_pipe(pipe_slow);
2778 %}
2779 
2780 instruct sqrtF_reg(regF dst, regF src) %{
2781   predicate(UseSSE>=1);
2782   match(Set dst (SqrtF src));
2783 
2784   format %{ "sqrtss  $dst, $src" %}
2785   ins_cost(150);
2786   ins_encode %{
2787     __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
2788   %}
2789   ins_pipe(pipe_slow);
2790 %}
2791 
2792 instruct sqrtF_mem(regF dst, memory src) %{
2793   predicate(UseSSE>=1);
2794   match(Set dst (SqrtF (LoadF src)));
2795 
2796   format %{ "sqrtss  $dst, $src" %}
2797   ins_cost(150);
2798   ins_encode %{
2799     __ sqrtss($dst$$XMMRegister, $src$$Address);
2800   %}
2801   ins_pipe(pipe_slow);
2802 %}
2803 
2804 instruct sqrtF_imm(regF dst, immF con) %{
2805   predicate(UseSSE>=1);
2806   match(Set dst (SqrtF con));
2807 
2808   format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
2809   ins_cost(150);
2810   ins_encode %{
2811     __ sqrtss($dst$$XMMRegister, $constantaddress($con));
2812   %}
2813   ins_pipe(pipe_slow);
2814 %}
2815 
2816 instruct sqrtD_reg(regD dst, regD src) %{
2817   predicate(UseSSE>=2);
2818   match(Set dst (SqrtD src));
2819 
2820   format %{ "sqrtsd  $dst, $src" %}
2821   ins_cost(150);
2822   ins_encode %{
2823     __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
2824   %}
2825   ins_pipe(pipe_slow);
2826 %}
2827 
2828 instruct sqrtD_mem(regD dst, memory src) %{
2829   predicate(UseSSE>=2);
2830   match(Set dst (SqrtD (LoadD src)));
2831 
2832   format %{ "sqrtsd  $dst, $src" %}
2833   ins_cost(150);
2834   ins_encode %{
2835     __ sqrtsd($dst$$XMMRegister, $src$$Address);
2836   %}
2837   ins_pipe(pipe_slow);
2838 %}
2839 
2840 instruct sqrtD_imm(regD dst, immD con) %{
2841   predicate(UseSSE>=2);
2842   match(Set dst (SqrtD con));
2843   format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
2844   ins_cost(150);
2845   ins_encode %{
2846     __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
2847   %}
2848   ins_pipe(pipe_slow);
2849 %}
2850 
2851 instruct onspinwait() %{
2852   match(OnSpinWait);
2853   ins_cost(200);
2854 
2855   format %{
2856     $$template
2857     $$emit$$"pause\t! membar_onspinwait"
2858   %}
2859   ins_encode %{
2860     __ pause();
2861   %}
2862   ins_pipe(pipe_slow);
2863 %}
2864 
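     // Fused multiply-add. Both patterns keep the accumulator in $c and are matched
     // only when the UseFMA flag is on; fmad()/fmaf() are the MacroAssembler helpers
     // used here for the scalar double/float cases.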
2865 // a * b + c
2866 instruct fmaD_reg(regD a, regD b, regD c) %{
2867   predicate(UseFMA);
2868   match(Set c (FmaD  c (Binary a b)));
2869   format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
2870   ins_cost(150);
2871   ins_encode %{
2872     __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2873   %}
2874   ins_pipe( pipe_slow );
2875 %}
2876 
2877 // a * b + c
2878 instruct fmaF_reg(regF a, regF b, regF c) %{
2879   predicate(UseFMA);
2880   match(Set c (FmaF  c (Binary a b)));
2881   format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
2882   ins_cost(150);
2883   ins_encode %{
2884     __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
2885   %}
2886   ins_pipe( pipe_slow );
2887 %}
2888 
2889 // ====================VECTOR INSTRUCTIONS=====================================
2890 
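     // Vector loads, stores and register-to-register moves, grouped by payload size:
     // vecS = 4, vecD = 8, vecX = 16, vecY = 32 and vecZ = 64 bytes. The legVec*
     // operands appear to be the same classes restricted to registers reachable by
     // legacy (non-EVEX) encodings; the Move*2Leg / MoveLeg2Vec* patterns below copy
     // values between the two register classes.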
2891 
2892 // Load vectors (4 bytes long)
2893 instruct loadV4(vecS dst, memory mem) %{
2894   predicate(n->as_LoadVector()->memory_size() == 4);
2895   match(Set dst (LoadVector mem));
2896   ins_cost(125);
2897   format %{ "movd    $dst,$mem\t! load vector (4 bytes)" %}
2898   ins_encode %{
2899     __ movdl($dst$$XMMRegister, $mem$$Address);
2900   %}
2901   ins_pipe( pipe_slow );
2902 %}
2903 
2904 // Move vectors (4 bytes long)
2905 instruct MoveVecS2Leg(legVecS dst, vecS src) %{
2906   match(Set dst src);
2907   format %{ "movss $dst,$src\t! load vector (4 bytes)" %}
2908   ins_encode %{
2909     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2910   %}
2911   ins_pipe( fpu_reg_reg );
2912 %}
2913 
2914 // Move vectors (4 bytes long)
2915 instruct MoveLeg2VecS(vecS dst, legVecS src) %{
2916   match(Set dst src);
2917   format %{ "movss $dst,$src\t! load vector (4 bytes)" %}
2918   ins_encode %{
2919     __ movflt($dst$$XMMRegister, $src$$XMMRegister);
2920   %}
2921   ins_pipe( fpu_reg_reg );
2922 %}
2923 
2924 // Load vectors (8 bytes long)
2925 instruct loadV8(vecD dst, memory mem) %{
2926   predicate(n->as_LoadVector()->memory_size() == 8);
2927   match(Set dst (LoadVector mem));
2928   ins_cost(125);
2929   format %{ "movq    $dst,$mem\t! load vector (8 bytes)" %}
2930   ins_encode %{
2931     __ movq($dst$$XMMRegister, $mem$$Address);
2932   %}
2933   ins_pipe( pipe_slow );
2934 %}
2935 
2936 // Move vectors (8 bytes long)
2937 instruct MoveVecD2Leg(legVecD dst, vecD src) %{
2938   match(Set dst src);
2939   format %{ "movsd $dst,$src\t! load vector (8 bytes)" %}
2940   ins_encode %{
2941     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
2942   %}
2943   ins_pipe( fpu_reg_reg );
2944 %}
2945 
2946 // Move vectors (8 bytes long)
2947 instruct MoveLeg2VecD(vecD dst, legVecD src) %{
2948   match(Set dst src);
2949   format %{ "movsd $dst,$src\t! load vector (8 bytes)" %}
2950   ins_encode %{
2951     __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
2952   %}
2953   ins_pipe( fpu_reg_reg );
2954 %}
2955 
2956 // Load vectors (16 bytes long)
2957 instruct loadV16(vecX dst, memory mem) %{
2958   predicate(n->as_LoadVector()->memory_size() == 16);
2959   match(Set dst (LoadVector mem));
2960   ins_cost(125);
2961   format %{ "movdqu  $dst,$mem\t! load vector (16 bytes)" %}
2962   ins_encode %{
2963     __ movdqu($dst$$XMMRegister, $mem$$Address);
2964   %}
2965   ins_pipe( pipe_slow );
2966 %}
2967 
2968 // Move vectors (16 bytes long)
2969 instruct MoveVecX2Leg(legVecX dst, vecX src) %{
2970   match(Set dst src);
2971   format %{ "movdqu $dst,$src\t! load vector (16 bytes)" %}
2972   ins_encode %{
2973     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2974       int vector_len = 2;
2975       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2976     } else {
2977       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2978     }
2979   %}
2980   ins_pipe( fpu_reg_reg );
2981 %}
2982 
2983 // Move vectors (16 bytes long)
2984 instruct MoveLeg2VecX(vecX dst, legVecX src) %{
2985   match(Set dst src);
2986   format %{ "movdqu $dst,$src\t! load vector (16 bytes)" %}
2987   ins_encode %{
2988     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
2989       int vector_len = 2;
2990       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
2991     } else {
2992       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
2993     }
2994   %}
2995   ins_pipe( fpu_reg_reg );
2996 %}
2997 
2998 // Load vectors (32 bytes long)
2999 instruct loadV32(vecY dst, memory mem) %{
3000   predicate(n->as_LoadVector()->memory_size() == 32);
3001   match(Set dst (LoadVector mem));
3002   ins_cost(125);
3003   format %{ "vmovdqu $dst,$mem\t! load vector (32 bytes)" %}
3004   ins_encode %{
3005     __ vmovdqu($dst$$XMMRegister, $mem$$Address);
3006   %}
3007   ins_pipe( pipe_slow );
3008 %}
3009 
3010 // Move vectors (32 bytes long)
3011 instruct MoveVecY2Leg(legVecY dst, vecY src) %{
3012   match(Set dst src);
3013   format %{ "vmovdqu $dst,$src\t! load vector (32 bytes)" %}
3014   ins_encode %{
3015     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3016       int vector_len = 2;
3017       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3018     } else {
3019       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3020     }
3021   %}
3022   ins_pipe( fpu_reg_reg );
3023 %}
3024 
3025 // Move vectors (32 bytes long)
3026 instruct MoveLeg2VecY(vecY dst, legVecY src) %{
3027   match(Set dst src);
3028   format %{ "vmovdqu $dst,$src\t! load vector (32 bytes)" %}
3029   ins_encode %{
3030     if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3031       int vector_len = 2;
3032       __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3033     } else {
3034       __ vmovdqu($dst$$XMMRegister, $src$$XMMRegister);
3035     }
3036   %}
3037   ins_pipe( fpu_reg_reg );
3038 %}
3039 
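     // 64-byte (ZMM-sized) vectors require EVEX encodings; the element size picks
     // between the doubleword form (evmovdqul) and the quadword form (evmovdquq).
     // The 64-byte stores further down follow the same split.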
3040 // Load vectors (64 bytes long)
3041 instruct loadV64_dword(vecZ dst, memory mem) %{
3042   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() <= 4);
3043   match(Set dst (LoadVector mem));
3044   ins_cost(125);
3045   format %{ "vmovdqul $dst k0,$mem\t! load vector (64 bytes)" %}
3046   ins_encode %{
3047     int vector_len = 2;
3048     __ evmovdqul($dst$$XMMRegister, $mem$$Address, vector_len);
3049   %}
3050   ins_pipe( pipe_slow );
3051 %}
3052 
3053 // Load vectors (64 bytes long)
3054 instruct loadV64_qword(vecZ dst, memory mem) %{
3055   predicate(n->as_LoadVector()->memory_size() == 64 && n->as_LoadVector()->element_size() > 4);
3056   match(Set dst (LoadVector mem));
3057   ins_cost(125);
3058   format %{ "vmovdquq $dst k0,$mem\t! load vector (64 bytes)" %}
3059   ins_encode %{
3060     int vector_len = 2;
3061     __ evmovdquq($dst$$XMMRegister, $mem$$Address, vector_len);
3062   %}
3063   ins_pipe( pipe_slow );
3064 %}
3065 
3066 instruct MoveVecZ2Leg(legVecZ dst, vecZ  src) %{
3067   match(Set dst src);
3068   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3069   ins_encode %{
3070     int vector_len = 2;
3071     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3072   %}
3073   ins_pipe( fpu_reg_reg );
3074 %}
3075 
3076 instruct MoveLeg2VecZ(vecZ dst, legVecZ  src) %{
3077   match(Set dst src);
3078   format %{ "vmovdquq $dst k0,$src\t! Move vector (64 bytes)" %}
3079   ins_encode %{
3080     int vector_len = 2;
3081     __ evmovdquq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
3082   %}
3083   ins_pipe( fpu_reg_reg );
3084 %}
3085 
3086 // Store vectors
3087 instruct storeV4(memory mem, vecS src) %{
3088   predicate(n->as_StoreVector()->memory_size() == 4);
3089   match(Set mem (StoreVector mem src));
3090   ins_cost(145);
3091   format %{ "movd    $mem,$src\t! store vector (4 bytes)" %}
3092   ins_encode %{
3093     __ movdl($mem$$Address, $src$$XMMRegister);
3094   %}
3095   ins_pipe( pipe_slow );
3096 %}
3097 
3098 instruct storeV8(memory mem, vecD src) %{
3099   predicate(n->as_StoreVector()->memory_size() == 8);
3100   match(Set mem (StoreVector mem src));
3101   ins_cost(145);
3102   format %{ "movq    $mem,$src\t! store vector (8 bytes)" %}
3103   ins_encode %{
3104     __ movq($mem$$Address, $src$$XMMRegister);
3105   %}
3106   ins_pipe( pipe_slow );
3107 %}
3108 
3109 instruct storeV16(memory mem, vecX src) %{
3110   predicate(n->as_StoreVector()->memory_size() == 16);
3111   match(Set mem (StoreVector mem src));
3112   ins_cost(145);
3113   format %{ "movdqu  $mem,$src\t! store vector (16 bytes)" %}
3114   ins_encode %{
3115     __ movdqu($mem$$Address, $src$$XMMRegister);
3116   %}
3117   ins_pipe( pipe_slow );
3118 %}
3119 
3120 instruct storeV32(memory mem, vecY src) %{
3121   predicate(n->as_StoreVector()->memory_size() == 32);
3122   match(Set mem (StoreVector mem src));
3123   ins_cost(145);
3124   format %{ "vmovdqu $mem,$src\t! store vector (32 bytes)" %}
3125   ins_encode %{
3126     __ vmovdqu($mem$$Address, $src$$XMMRegister);
3127   %}
3128   ins_pipe( pipe_slow );
3129 %}
3130 
3131 instruct storeV64_dword(memory mem, vecZ src) %{
3132   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() <= 4);
3133   match(Set mem (StoreVector mem src));
3134   ins_cost(145);
3135   format %{ "vmovdqul $mem k0,$src\t! store vector (64 bytes)" %}
3136   ins_encode %{
3137     int vector_len = 2;
3138     __ evmovdqul($mem$$Address, $src$$XMMRegister, vector_len);
3139   %}
3140   ins_pipe( pipe_slow );
3141 %}
3142 
3143 instruct storeV64_qword(memory mem, vecZ src) %{
3144   predicate(n->as_StoreVector()->memory_size() == 64 && n->as_StoreVector()->element_size() > 4);
3145   match(Set mem (StoreVector mem src));
3146   ins_cost(145);
3147   format %{ "vmovdquq $mem k0,$src\t! store vector (64 bytes)" %}
3148   ins_encode %{
3149     int vector_len = 2;
3150     __ evmovdquq($mem$$Address, $src$$XMMRegister, vector_len);
3151   %}
3152   ins_pipe( pipe_slow );
3153 %}
3154 
3155 // ====================LEGACY REPLICATE=======================================
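     // Broadcast ("replicate") patterns used when the AVX-512 VL/BW broadcast forms
     // are unavailable (note the !VM_Version::supports_avx512vlbw() /
     // !VM_Version::supports_avx512vl() predicates). The common idiom: move the scalar
     // into the low lanes (movdl/movq), duplicate it across the low 128 bits with a
     // shuffle (pshuflw/pshufd/punpcklqdq), then widen to 256 bits with
     // vinserti128_high and, for 512-bit results, to 512 bits with vinserti64x4.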
3156 
3157 instruct Repl16B(vecX dst, rRegI src) %{
3158   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3159   match(Set dst (ReplicateB src));
3160   format %{ "movd    $dst,$src\n\t"
3161             "punpcklbw $dst,$dst\n\t"
3162             "pshuflw $dst,$dst,0x00\n\t"
3163             "punpcklqdq $dst,$dst\t! replicate16B" %}
3164   ins_encode %{
3165     __ movdl($dst$$XMMRegister, $src$$Register);
3166     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3167     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3168     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3169   %}
3170   ins_pipe( pipe_slow );
3171 %}
3172 
3173 instruct Repl32B(vecY dst, rRegI src) %{
3174   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3175   match(Set dst (ReplicateB src));
3176   format %{ "movd    $dst,$src\n\t"
3177             "punpcklbw $dst,$dst\n\t"
3178             "pshuflw $dst,$dst,0x00\n\t"
3179             "punpcklqdq $dst,$dst\n\t"
3180             "vinserti128_high $dst,$dst\t! replicate32B" %}
3181   ins_encode %{
3182     __ movdl($dst$$XMMRegister, $src$$Register);
3183     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3184     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3185     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3186     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3187   %}
3188   ins_pipe( pipe_slow );
3189 %}
3190 
3191 instruct Repl64B(legVecZ dst, rRegI src) %{
3192   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3193   match(Set dst (ReplicateB src));
3194   format %{ "movd    $dst,$src\n\t"
3195             "punpcklbw $dst,$dst\n\t"
3196             "pshuflw $dst,$dst,0x00\n\t"
3197             "punpcklqdq $dst,$dst\n\t"
3198             "vinserti128_high $dst,$dst\n\t"
3199             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B" %}
3200   ins_encode %{
3201     __ movdl($dst$$XMMRegister, $src$$Register);
3202     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3203     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3204     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3205     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3206     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3207   %}
3208   ins_pipe( pipe_slow );
3209 %}
3210 
3211 instruct Repl16B_imm(vecX dst, immI con) %{
3212   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3213   match(Set dst (ReplicateB con));
3214   format %{ "movq    $dst,[$constantaddress]\n\t"
3215             "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
3216   ins_encode %{
3217     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3218     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3219   %}
3220   ins_pipe( pipe_slow );
3221 %}
3222 
3223 instruct Repl32B_imm(vecY dst, immI con) %{
3224   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3225   match(Set dst (ReplicateB con));
3226   format %{ "movq    $dst,[$constantaddress]\n\t"
3227             "punpcklqdq $dst,$dst\n\t"
3228             "vinserti128_high $dst,$dst\t! replicate32B($con)" %}
3229   ins_encode %{
3230     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3231     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3232     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3233   %}
3234   ins_pipe( pipe_slow );
3235 %}
3236 
3237 instruct Repl64B_imm(legVecZ dst, immI con) %{
3238   predicate(n->as_Vector()->length() == 64 && !VM_Version::supports_avx512vlbw());
3239   match(Set dst (ReplicateB con));
3240   format %{ "movq    $dst,[$constantaddress]\n\t"
3241             "punpcklqdq $dst,$dst\n\t"
3242             "vinserti128_high $dst,$dst\n\t"
3243             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate64B($con)" %}
3244   ins_encode %{
3245     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3246     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3247     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3248     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3249   %}
3250   ins_pipe( pipe_slow );
3251 %}
3252 
3253 instruct Repl4S(vecD dst, rRegI src) %{
3254   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
3255   match(Set dst (ReplicateS src));
3256   format %{ "movd    $dst,$src\n\t"
3257             "pshuflw $dst,$dst,0x00\t! replicate4S" %}
3258   ins_encode %{
3259     __ movdl($dst$$XMMRegister, $src$$Register);
3260     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3261   %}
3262   ins_pipe( pipe_slow );
3263 %}
3264 
3265 instruct Repl4S_mem(vecD dst, memory mem) %{
3266   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3267   match(Set dst (ReplicateS (LoadS mem)));
3268   format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
3269   ins_encode %{
3270     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3271   %}
3272   ins_pipe( pipe_slow );
3273 %}
3274 
3275 instruct Repl8S(vecX dst, rRegI src) %{
3276   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3277   match(Set dst (ReplicateS src));
3278   format %{ "movd    $dst,$src\n\t"
3279             "pshuflw $dst,$dst,0x00\n\t"
3280             "punpcklqdq $dst,$dst\t! replicate8S" %}
3281   ins_encode %{
3282     __ movdl($dst$$XMMRegister, $src$$Register);
3283     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3284     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3285   %}
3286   ins_pipe( pipe_slow );
3287 %}
3288 
3289 instruct Repl8S_mem(vecX dst, memory mem) %{
3290   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
3291   match(Set dst (ReplicateS (LoadS mem)));
3292   format %{ "pshuflw $dst,$mem,0x00\n\t"
3293             "punpcklqdq $dst,$dst\t! replicate8S" %}
3294   ins_encode %{
3295     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3296     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3297   %}
3298   ins_pipe( pipe_slow );
3299 %}
3300 
3301 instruct Repl8S_imm(vecX dst, immI con) %{
3302   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
3303   match(Set dst (ReplicateS con));
3304   format %{ "movq    $dst,[$constantaddress]\n\t"
3305             "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
3306   ins_encode %{
3307     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3308     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3309   %}
3310   ins_pipe( pipe_slow );
3311 %}
3312 
3313 instruct Repl16S(vecY dst, rRegI src) %{
3314   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3315   match(Set dst (ReplicateS src));
3316   format %{ "movd    $dst,$src\n\t"
3317             "pshuflw $dst,$dst,0x00\n\t"
3318             "punpcklqdq $dst,$dst\n\t"
3319             "vinserti128_high $dst,$dst\t! replicate16S" %}
3320   ins_encode %{
3321     __ movdl($dst$$XMMRegister, $src$$Register);
3322     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3323     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3324     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3325   %}
3326   ins_pipe( pipe_slow );
3327 %}
3328 
3329 instruct Repl16S_mem(vecY dst, memory mem) %{
3330   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3331   match(Set dst (ReplicateS (LoadS mem)));
3332   format %{ "pshuflw $dst,$mem,0x00\n\t"
3333             "punpcklqdq $dst,$dst\n\t"
3334             "vinserti128_high $dst,$dst\t! replicate16S" %}
3335   ins_encode %{
3336     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3337     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3338     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3339   %}
3340   ins_pipe( pipe_slow );
3341 %}
3342 
3343 instruct Repl16S_imm(vecY dst, immI con) %{
3344   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
3345   match(Set dst (ReplicateS con));
3346   format %{ "movq    $dst,[$constantaddress]\n\t"
3347             "punpcklqdq $dst,$dst\n\t"
3348             "vinserti128_high $dst,$dst\t! replicate16S($con)" %}
3349   ins_encode %{
3350     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3351     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3352     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3353   %}
3354   ins_pipe( pipe_slow );
3355 %}
3356 
3357 instruct Repl32S(legVecZ dst, rRegI src) %{
3358   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3359   match(Set dst (ReplicateS src));
3360   format %{ "movd    $dst,$src\n\t"
3361             "pshuflw $dst,$dst,0x00\n\t"
3362             "punpcklqdq $dst,$dst\n\t"
3363             "vinserti128_high $dst,$dst\n\t"
3364             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3365   ins_encode %{
3366     __ movdl($dst$$XMMRegister, $src$$Register);
3367     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3368     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3369     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3370     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3371   %}
3372   ins_pipe( pipe_slow );
3373 %}
3374 
3375 instruct Repl32S_mem(legVecZ dst, memory mem) %{
3376   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3377   match(Set dst (ReplicateS (LoadS mem)));
3378   format %{ "pshuflw $dst,$mem,0x00\n\t"
3379             "punpcklqdq $dst,$dst\n\t"
3380             "vinserti128_high $dst,$dst\n\t"
3381             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S" %}
3382   ins_encode %{
3383     __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
3384     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3385     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3386     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3387   %}
3388   ins_pipe( pipe_slow );
3389 %}
3390 
3391 instruct Repl32S_imm(legVecZ dst, immI con) %{
3392   predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
3393   match(Set dst (ReplicateS con));
3394   format %{ "movq    $dst,[$constantaddress]\n\t"
3395             "punpcklqdq $dst,$dst\n\t"
3396             "vinserti128_high $dst,$dst\n\t"
3397             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate32S($con)" %}
3398   ins_encode %{
3399     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3400     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3401     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3402     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3403   %}
3404   ins_pipe( pipe_slow );
3405 %}
3406 
3407 instruct Repl4I(vecX dst, rRegI src) %{
3408   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3409   match(Set dst (ReplicateI src));
3410   format %{ "movd    $dst,$src\n\t"
3411             "pshufd  $dst,$dst,0x00\t! replicate4I" %}
3412   ins_encode %{
3413     __ movdl($dst$$XMMRegister, $src$$Register);
3414     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3415   %}
3416   ins_pipe( pipe_slow );
3417 %}
3418 
3419 instruct Repl4I_mem(vecX dst, memory mem) %{
3420   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3421   match(Set dst (ReplicateI (LoadI mem)));
3422   format %{ "pshufd  $dst,$mem,0x00\t! replicate4I" %}
3423   ins_encode %{
3424     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3425   %}
3426   ins_pipe( pipe_slow );
3427 %}
3428 
3429 instruct Repl8I(vecY dst, rRegI src) %{
3430   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3431   match(Set dst (ReplicateI src));
3432   format %{ "movd    $dst,$src\n\t"
3433             "pshufd  $dst,$dst,0x00\n\t"
3434             "vinserti128_high $dst,$dst\t! replicate8I" %}
3435   ins_encode %{
3436     __ movdl($dst$$XMMRegister, $src$$Register);
3437     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3438     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3439   %}
3440   ins_pipe( pipe_slow );
3441 %}
3442 
3443 instruct Repl8I_mem(vecY dst, memory mem) %{
3444   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3445   match(Set dst (ReplicateI (LoadI mem)));
3446   format %{ "pshufd  $dst,$mem,0x00\n\t"
3447             "vinserti128_high $dst,$dst\t! replicate8I" %}
3448   ins_encode %{
3449     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3450     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3451   %}
3452   ins_pipe( pipe_slow );
3453 %}
3454 
3455 instruct Repl16I(legVecZ dst, rRegI src) %{
3456   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3457   match(Set dst (ReplicateI src));
3458   format %{ "movd    $dst,$src\n\t"
3459             "pshufd  $dst,$dst,0x00\n\t"
3460             "vinserti128_high $dst,$dst\n\t"
3461             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3462   ins_encode %{
3463     __ movdl($dst$$XMMRegister, $src$$Register);
3464     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3465     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3466     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3467   %}
3468   ins_pipe( pipe_slow );
3469 %}
3470 
3471 instruct Repl16I_mem(legVecZ dst, memory mem) %{
3472   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3473   match(Set dst (ReplicateI (LoadI mem)));
3474   format %{ "pshufd  $dst,$mem,0x00\n\t"
3475             "vinserti128_high $dst,$dst\n\t"
3476             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I" %}
3477   ins_encode %{
3478     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3479     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3480     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3481   %}
3482   ins_pipe( pipe_slow );
3483 %}
3484 
3485 instruct Repl4I_imm(vecX dst, immI con) %{
3486   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3487   match(Set dst (ReplicateI con));
3488   format %{ "movq    $dst,[$constantaddress]\t! replicate4I($con)\n\t"
3489             "punpcklqdq $dst,$dst" %}
3490   ins_encode %{
3491     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3492     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3493   %}
3494   ins_pipe( pipe_slow );
3495 %}
3496 
3497 instruct Repl8I_imm(vecY dst, immI con) %{
3498   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3499   match(Set dst (ReplicateI con));
3500   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
3501             "punpcklqdq $dst,$dst\n\t"
3502             "vinserti128_high $dst,$dst" %}
3503   ins_encode %{
3504     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3505     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3506     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3507   %}
3508   ins_pipe( pipe_slow );
3509 %}
3510 
3511 instruct Repl16I_imm(legVecZ dst, immI con) %{
3512   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3513   match(Set dst (ReplicateI con));
3514   format %{ "movq    $dst,[$constantaddress]\n\t"
3515             "punpcklqdq $dst,$dst\n\t"
3516             "vinserti128_high $dst,$dst\n\t"
3517             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16I($con)" %}
3518   ins_encode %{
3519     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
3520     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3521     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3522     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3523   %}
3524   ins_pipe( pipe_slow );
3525 %}
3526 
3527 // A long can be loaded into an XMM register directly from memory.
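     // (movq fetches all 8 bytes straight from memory, so no transfer through a
     // general-purpose register is needed, even on 32-bit.)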
3528 instruct Repl2L_mem(vecX dst, memory mem) %{
3529   predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
3530   match(Set dst (ReplicateL (LoadL mem)));
3531   format %{ "movq    $dst,$mem\n\t"
3532             "punpcklqdq $dst,$dst\t! replicate2L" %}
3533   ins_encode %{
3534     __ movq($dst$$XMMRegister, $mem$$Address);
3535     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3536   %}
3537   ins_pipe( pipe_slow );
3538 %}
3539 
3540 // Replicate long (8-byte) scalar into a vector
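     // On 64-bit the long value arrives in a single GPR and is transferred with movdq;
     // in the 32-bit #else branch it is assembled from the lo/hi register halves with
     // two movdl transfers and a punpckldq before being broadcast.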
3541 #ifdef _LP64
3542 instruct Repl4L(vecY dst, rRegL src) %{
3543   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3544   match(Set dst (ReplicateL src));
3545   format %{ "movdq   $dst,$src\n\t"
3546             "punpcklqdq $dst,$dst\n\t"
3547             "vinserti128_high $dst,$dst\t! replicate4L" %}
3548   ins_encode %{
3549     __ movdq($dst$$XMMRegister, $src$$Register);
3550     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3551     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3552   %}
3553   ins_pipe( pipe_slow );
3554 %}
3555 
3556 instruct Repl8L(legVecZ dst, rRegL src) %{
3557   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3558   match(Set dst (ReplicateL src));
3559   format %{ "movdq   $dst,$src\n\t"
3560             "punpcklqdq $dst,$dst\n\t"
3561             "vinserti128_high $dst,$dst\n\t"
3562             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3563   ins_encode %{
3564     __ movdq($dst$$XMMRegister, $src$$Register);
3565     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3566     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3567     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3568   %}
3569   ins_pipe( pipe_slow );
3570 %}
3571 #else // _LP64
3572 instruct Repl4L(vecY dst, eRegL src, vecY tmp) %{
3573   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3574   match(Set dst (ReplicateL src));
3575   effect(TEMP dst, USE src, TEMP tmp);
3576   format %{ "movdl   $dst,$src.lo\n\t"
3577             "movdl   $tmp,$src.hi\n\t"
3578             "punpckldq $dst,$tmp\n\t"
3579             "punpcklqdq $dst,$dst\n\t"
3580             "vinserti128_high $dst,$dst\t! replicate4L" %}
3581   ins_encode %{
3582     __ movdl($dst$$XMMRegister, $src$$Register);
3583     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3584     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3585     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3586     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3587   %}
3588   ins_pipe( pipe_slow );
3589 %}
3590 
3591 instruct Repl8L(legVecZ dst, eRegL src, legVecZ tmp) %{
3592   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3593   match(Set dst (ReplicateL src));
3594   effect(TEMP dst, USE src, TEMP tmp);
3595   format %{ "movdl   $dst,$src.lo\n\t"
3596             "movdl   $tmp,$src.hi\n\t"
3597             "punpckldq $dst,$tmp\n\t"
3598             "punpcklqdq $dst,$dst\n\t"
3599             "vinserti128_high $dst,$dst\n\t"
3600             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3601   ins_encode %{
3602     __ movdl($dst$$XMMRegister, $src$$Register);
3603     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
3604     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
3605     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3606     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3607     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3608   %}
3609   ins_pipe( pipe_slow );
3610 %}
3611 #endif // _LP64
3612 
3613 instruct Repl4L_imm(vecY dst, immL con) %{
3614   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3615   match(Set dst (ReplicateL con));
3616   format %{ "movq    $dst,[$constantaddress]\n\t"
3617             "punpcklqdq $dst,$dst\n\t"
3618             "vinserti128_high $dst,$dst\t! replicate4L($con)" %}
3619   ins_encode %{
3620     __ movq($dst$$XMMRegister, $constantaddress($con));
3621     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3622     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3623   %}
3624   ins_pipe( pipe_slow );
3625 %}
3626 
3627 instruct Repl8L_imm(legVecZ dst, immL con) %{
3628   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3629   match(Set dst (ReplicateL con));
3630   format %{ "movq    $dst,[$constantaddress]\n\t"
3631             "punpcklqdq $dst,$dst\n\t"
3632             "vinserti128_high $dst,$dst\n\t"
3633             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L($con)" %}
3634   ins_encode %{
3635     __ movq($dst$$XMMRegister, $constantaddress($con));
3636     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3637     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3638     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3639   %}
3640   ins_pipe( pipe_slow );
3641 %}
3642 
3643 instruct Repl4L_mem(vecY dst, memory mem) %{
3644   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3645   match(Set dst (ReplicateL (LoadL mem)));
3646   format %{ "movq    $dst,$mem\n\t"
3647             "punpcklqdq $dst,$dst\n\t"
3648             "vinserti128_high $dst,$dst\t! replicate4L" %}
3649   ins_encode %{
3650     __ movq($dst$$XMMRegister, $mem$$Address);
3651     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3652     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3653   %}
3654   ins_pipe( pipe_slow );
3655 %}
3656 
3657 instruct Repl8L_mem(legVecZ dst, memory mem) %{
3658   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3659   match(Set dst (ReplicateL (LoadL mem)));
3660   format %{ "movq    $dst,$mem\n\t"
3661             "punpcklqdq $dst,$dst\n\t"
3662             "vinserti128_high $dst,$dst\n\t"
3663             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8L" %}
3664   ins_encode %{
3665     __ movq($dst$$XMMRegister, $mem$$Address);
3666     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
3667     __ vinserti128_high($dst$$XMMRegister, $dst$$XMMRegister);
3668     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3669   %}
3670   ins_pipe( pipe_slow );
3671 %}
3672 
3673 instruct Repl2F_mem(vecD dst, memory mem) %{
3674   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3675   match(Set dst (ReplicateF (LoadF mem)));
3676   format %{ "pshufd  $dst,$mem,0x00\t! replicate2F" %}
3677   ins_encode %{
3678     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3679   %}
3680   ins_pipe( pipe_slow );
3681 %}
3682 
3683 instruct Repl4F_mem(vecX dst, memory mem) %{
3684   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3685   match(Set dst (ReplicateF (LoadF mem)));
3686   format %{ "pshufd  $dst,$mem,0x00\t! replicate4F" %}
3687   ins_encode %{
3688     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3689   %}
3690   ins_pipe( pipe_slow );
3691 %}
3692 
3693 instruct Repl8F(vecY dst, vlRegF src) %{
3694   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3695   match(Set dst (ReplicateF src));
3696   format %{ "pshufd  $dst,$src,0x00\n\t"
3697             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3698   ins_encode %{
3699     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3700     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3701   %}
3702   ins_pipe( pipe_slow );
3703 %}
3704 
3705 instruct Repl8F_mem(vecY dst, memory mem) %{
3706   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3707   match(Set dst (ReplicateF (LoadF mem)));
3708   format %{ "pshufd  $dst,$mem,0x00\n\t"
3709             "vinsertf128_high $dst,$dst\t! replicate8F" %}
3710   ins_encode %{
3711     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3712     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3713   %}
3714   ins_pipe( pipe_slow );
3715 %}
3716 
3717 instruct Repl16F(legVecZ dst, vlRegF src) %{
3718   predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3719   match(Set dst (ReplicateF src));
3720   format %{ "pshufd  $dst,$src,0x00\n\t"
3721             "vinsertf128_high $dst,$dst\t"
3722             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3723   ins_encode %{
3724     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
3725     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3726     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3727   %}
3728   ins_pipe( pipe_slow );
3729 %}
3730 
3731 instruct Repl16F_mem(legVecZ dst, memory mem) %{
3732   predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vl());
3733   match(Set dst (ReplicateF (LoadF mem)));
3734   format %{ "pshufd  $dst,$mem,0x00\n\t"
3735             "vinsertf128_high $dst,$dst\t"
3736             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate16F" %}
3737   ins_encode %{
3738     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
3739     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3740     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3741   %}
3742   ins_pipe( pipe_slow );
3743 %}
3744 
3745 instruct Repl2F_zero(vecD dst, immF0 zero) %{
3746   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3747   match(Set dst (ReplicateF zero));
3748   format %{ "xorps   $dst,$dst\t! replicate2F zero" %}
3749   ins_encode %{
3750     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3751   %}
3752   ins_pipe( fpu_reg_reg );
3753 %}
3754 
3755 instruct Repl4F_zero(vecX dst, immF0 zero) %{
3756   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3757   match(Set dst (ReplicateF zero));
3758   format %{ "xorps   $dst,$dst\t! replicate4F zero" %}
3759   ins_encode %{
3760     __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
3761   %}
3762   ins_pipe( fpu_reg_reg );
3763 %}
3764 
3765 instruct Repl8F_zero(vecY dst, immF0 zero) %{
3766   predicate(n->as_Vector()->length() == 8 && UseAVX < 3);
3767   match(Set dst (ReplicateF zero));
3768   format %{ "vxorps  $dst,$dst,$dst\t! replicate8F zero" %}
3769   ins_encode %{
3770     int vector_len = 1;
3771     __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3772   %}
3773   ins_pipe( fpu_reg_reg );
3774 %}
3775 
3776 instruct Repl2D_mem(vecX dst, memory mem) %{
3777   predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3778   match(Set dst (ReplicateD (LoadD mem)));
3779   format %{ "pshufd  $dst,$mem,0x44\t! replicate2D" %}
3780   ins_encode %{
3781     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3782   %}
3783   ins_pipe( pipe_slow );
3784 %}
3785 
3786 instruct Repl4D(vecY dst, vlRegD src) %{
3787   predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3788   match(Set dst (ReplicateD src));
3789   format %{ "pshufd  $dst,$src,0x44\n\t"
3790             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3791   ins_encode %{
3792     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3793     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3794   %}
3795   ins_pipe( pipe_slow );
3796 %}
3797 
3798 instruct Repl4D_mem(vecY dst, memory mem) %{
3799   predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
3800   match(Set dst (ReplicateD (LoadD mem)));
3801   format %{ "pshufd  $dst,$mem,0x44\n\t"
3802             "vinsertf128_high $dst,$dst\t! replicate4D" %}
3803   ins_encode %{
3804     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3805     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3806   %}
3807   ins_pipe( pipe_slow );
3808 %}
3809 
3810 instruct Repl8D(legVecZ dst, vlRegD src) %{
3811   predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vl());
3812   match(Set dst (ReplicateD src));
3813   format %{ "pshufd  $dst,$src,0x44\n\t"
3814             "vinsertf128_high $dst,$dst\t"
3815             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3816   ins_encode %{
3817     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
3818     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3819     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3820   %}
3821   ins_pipe( pipe_slow );
3822 %}
3823 
3824 instruct Repl8D_mem(legVecZ dst, memory mem) %{
3825   predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
3826   match(Set dst (ReplicateD (LoadD mem)));
3827   format %{ "pshufd  $dst,$mem,0x44\n\t"
3828             "vinsertf128_high $dst,$dst\t"
3829             "vinserti64x4 $dst,$dst,$dst,0x1\t! replicate8D" %}
3830   ins_encode %{
3831     __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
3832     __ vinsertf128_high($dst$$XMMRegister, $dst$$XMMRegister);
3833     __ vinserti64x4($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, 0x1);
3834   %}
3835   ins_pipe( pipe_slow );
3836 %}
3837 
3838 // Replicate double (8 byte) scalar zero to be vector
3839 instruct Repl2D_zero(vecX dst, immD0 zero) %{
3840   predicate(n->as_Vector()->length() == 2 && UseAVX < 3);
3841   match(Set dst (ReplicateD zero));
3842   format %{ "xorpd   $dst,$dst\t! replicate2D zero" %}
3843   ins_encode %{
3844     __ xorpd($dst$$XMMRegister, $dst$$XMMRegister);
3845   %}
3846   ins_pipe( fpu_reg_reg );
3847 %}
3848 
3849 instruct Repl4D_zero(vecY dst, immD0 zero) %{
3850   predicate(n->as_Vector()->length() == 4 && UseAVX < 3);
3851   match(Set dst (ReplicateD zero));
3852   format %{ "vxorpd  $dst,$dst,$dst,vect256\t! replicate4D zero" %}
3853   ins_encode %{
3854     int vector_len = 1;
3855     __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3856   %}
3857   ins_pipe( fpu_reg_reg );
3858 %}
3859 
3860 // ====================GENERIC REPLICATE==========================================
3861 
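     // These generic forms synthesize a broadcast from pre-AVX2 shuffles:
     //   movd/movdq   moves the scalar from a general register into the low XMM lanes,
     //   punpcklbw    widens each byte into a word (byte case only),
     //   pshuflw 0x00 replicates word 0 across the low four words, and
     //   pshufd  0x00 replicates dword 0 across all four dwords.
     // The pshufd immediate selects source elements two bits at a time: 0x00 picks
     // {0,0,0,0}, while 0x44 (binary 01 00 01 00) picks {0,1,0,1} and so duplicates the
     // low 64-bit lane for the long/double forms.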
3862 // Replicate byte scalar to be vector
3863 instruct Repl4B(vecS dst, rRegI src) %{
3864   predicate(n->as_Vector()->length() == 4);
3865   match(Set dst (ReplicateB src));
3866   format %{ "movd    $dst,$src\n\t"
3867             "punpcklbw $dst,$dst\n\t"
3868             "pshuflw $dst,$dst,0x00\t! replicate4B" %}
3869   ins_encode %{
3870     __ movdl($dst$$XMMRegister, $src$$Register);
3871     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3872     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3873   %}
3874   ins_pipe( pipe_slow );
3875 %}
3876 
3877 instruct Repl8B(vecD dst, rRegI src) %{
3878   predicate(n->as_Vector()->length() == 8);
3879   match(Set dst (ReplicateB src));
3880   format %{ "movd    $dst,$src\n\t"
3881             "punpcklbw $dst,$dst\n\t"
3882             "pshuflw $dst,$dst,0x00\t! replicate8B" %}
3883   ins_encode %{
3884     __ movdl($dst$$XMMRegister, $src$$Register);
3885     __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
3886     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3887   %}
3888   ins_pipe( pipe_slow );
3889 %}
3890 
3891 // Replicate byte scalar immediate to be vector by loading from const table.
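     // replicate4_imm()/replicate8_imm() repeat the immediate at the given element size
     // (1, 2 or 4 bytes) until a 4- or 8-byte constant is filled; the resulting
     // constant-table entry is then loaded with movdl/movq.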
3892 instruct Repl4B_imm(vecS dst, immI con) %{
3893   predicate(n->as_Vector()->length() == 4);
3894   match(Set dst (ReplicateB con));
3895   format %{ "movdl   $dst,[$constantaddress]\t! replicate4B($con)" %}
3896   ins_encode %{
3897     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 1)));
3898   %}
3899   ins_pipe( pipe_slow );
3900 %}
3901 
3902 instruct Repl8B_imm(vecD dst, immI con) %{
3903   predicate(n->as_Vector()->length() == 8);
3904   match(Set dst (ReplicateB con));
3905   format %{ "movq    $dst,[$constantaddress]\t! replicate8B($con)" %}
3906   ins_encode %{
3907     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
3908   %}
3909   ins_pipe( pipe_slow );
3910 %}
3911 
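     // The zeroing forms below xor the destination with itself, so the immediate zero is
     // never materialized; the xor-zero idiom is also recognized by the hardware as
     // dependency-breaking.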
3912 // Replicate byte scalar zero to be vector
3913 instruct Repl4B_zero(vecS dst, immI0 zero) %{
3914   predicate(n->as_Vector()->length() == 4);
3915   match(Set dst (ReplicateB zero));
3916   format %{ "pxor    $dst,$dst\t! replicate4B zero" %}
3917   ins_encode %{
3918     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3919   %}
3920   ins_pipe( fpu_reg_reg );
3921 %}
3922 
3923 instruct Repl8B_zero(vecD dst, immI0 zero) %{
3924   predicate(n->as_Vector()->length() == 8);
3925   match(Set dst (ReplicateB zero));
3926   format %{ "pxor    $dst,$dst\t! replicate8B zero" %}
3927   ins_encode %{
3928     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3929   %}
3930   ins_pipe( fpu_reg_reg );
3931 %}
3932 
3933 instruct Repl16B_zero(vecX dst, immI0 zero) %{
3934   predicate(n->as_Vector()->length() == 16);
3935   match(Set dst (ReplicateB zero));
3936   format %{ "pxor    $dst,$dst\t! replicate16B zero" %}
3937   ins_encode %{
3938     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3939   %}
3940   ins_pipe( fpu_reg_reg );
3941 %}
3942 
3943 instruct Repl32B_zero(vecY dst, immI0 zero) %{
3944   predicate(n->as_Vector()->length() == 32);
3945   match(Set dst (ReplicateB zero));
3946   format %{ "vpxor   $dst,$dst,$dst\t! replicate32B zero" %}
3947   ins_encode %{
3948     // vpxor() falls back to vxorpd on plain AVX, since 256-bit integer vpxor is only available with AVX2.
3949     int vector_len = 1;
3950     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
3951   %}
3952   ins_pipe( fpu_reg_reg );
3953 %}
3954 
3955 // Replicate char/short (2 byte) scalar to be vector
3956 instruct Repl2S(vecS dst, rRegI src) %{
3957   predicate(n->as_Vector()->length() == 2);
3958   match(Set dst (ReplicateS src));
3959   format %{ "movd    $dst,$src\n\t"
3960             "pshuflw $dst,$dst,0x00\t! replicate2S" %}
3961   ins_encode %{
3962     __ movdl($dst$$XMMRegister, $src$$Register);
3963     __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
3964   %}
3965   ins_pipe( fpu_reg_reg );
3966 %}
3967 
3968 // Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
3969 instruct Repl2S_imm(vecS dst, immI con) %{
3970   predicate(n->as_Vector()->length() == 2);
3971   match(Set dst (ReplicateS con));
3972   format %{ "movdl   $dst,[$constantaddress]\t! replicate2S($con)" %}
3973   ins_encode %{
3974     __ movdl($dst$$XMMRegister, $constantaddress(replicate4_imm($con$$constant, 2)));
3975   %}
3976   ins_pipe( fpu_reg_reg );
3977 %}
3978 
3979 instruct Repl4S_imm(vecD dst, immI con) %{
3980   predicate(n->as_Vector()->length() == 4);
3981   match(Set dst (ReplicateS con));
3982   format %{ "movq    $dst,[$constantaddress]\t! replicate4S($con)" %}
3983   ins_encode %{
3984     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
3985   %}
3986   ins_pipe( fpu_reg_reg );
3987 %}
3988 
3989 // Replicate char/short (2 byte) scalar zero to be vector
3990 instruct Repl2S_zero(vecS dst, immI0 zero) %{
3991   predicate(n->as_Vector()->length() == 2);
3992   match(Set dst (ReplicateS zero));
3993   format %{ "pxor    $dst,$dst\t! replicate2S zero" %}
3994   ins_encode %{
3995     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
3996   %}
3997   ins_pipe( fpu_reg_reg );
3998 %}
3999 
4000 instruct Repl4S_zero(vecD dst, immI0 zero) %{
4001   predicate(n->as_Vector()->length() == 4);
4002   match(Set dst (ReplicateS zero));
4003   format %{ "pxor    $dst,$dst\t! replicate4S zero" %}
4004   ins_encode %{
4005     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4006   %}
4007   ins_pipe( fpu_reg_reg );
4008 %}
4009 
4010 instruct Repl8S_zero(vecX dst, immI0 zero) %{
4011   predicate(n->as_Vector()->length() == 8);
4012   match(Set dst (ReplicateS zero));
4013   format %{ "pxor    $dst,$dst\t! replicate8S zero" %}
4014   ins_encode %{
4015     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4016   %}
4017   ins_pipe( fpu_reg_reg );
4018 %}
4019 
4020 instruct Repl16S_zero(vecY dst, immI0 zero) %{
4021   predicate(n->as_Vector()->length() == 16);
4022   match(Set dst (ReplicateS zero));
4023   format %{ "vpxor   $dst,$dst,$dst\t! replicate16S zero" %}
4024   ins_encode %{
4025     // vpxor() falls back to vxorpd on plain AVX, since 256-bit integer vpxor is only available with AVX2.
4026     int vector_len = 1;
4027     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4028   %}
4029   ins_pipe( fpu_reg_reg );
4030 %}
4031 
4032 // Replicate integer (4 byte) scalar to be vector
4033 instruct Repl2I(vecD dst, rRegI src) %{
4034   predicate(n->as_Vector()->length() == 2);
4035   match(Set dst (ReplicateI src));
4036   format %{ "movd    $dst,$src\n\t"
4037             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4038   ins_encode %{
4039     __ movdl($dst$$XMMRegister, $src$$Register);
4040     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4041   %}
4042   ins_pipe( fpu_reg_reg );
4043 %}
4044 
4045 // The integer can be loaded into the XMM register directly from memory.
4046 instruct Repl2I_mem(vecD dst, memory mem) %{
4047   predicate(n->as_Vector()->length() == 2);
4048   match(Set dst (ReplicateI (LoadI mem)));
4049   format %{ "movd    $dst,$mem\n\t"
4050             "pshufd  $dst,$dst,0x00\t! replicate2I" %}
4051   ins_encode %{
4052     __ movdl($dst$$XMMRegister, $mem$$Address);
4053     __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
4054   %}
4055   ins_pipe( fpu_reg_reg );
4056 %}
4057 
4058 // Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
4059 instruct Repl2I_imm(vecD dst, immI con) %{
4060   predicate(n->as_Vector()->length() == 2);
4061   match(Set dst (ReplicateI con));
4062   format %{ "movq    $dst,[$constantaddress]\t! replicate2I($con)" %}
4063   ins_encode %{
4064     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4065   %}
4066   ins_pipe( fpu_reg_reg );
4067 %}
4068 
4069 // Replicate integer (4 byte) scalar zero to be vector
4070 instruct Repl2I_zero(vecD dst, immI0 zero) %{
4071   predicate(n->as_Vector()->length() == 2);
4072   match(Set dst (ReplicateI zero));
4073   format %{ "pxor    $dst,$dst\t! replicate2I" %}
4074   ins_encode %{
4075     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4076   %}
4077   ins_pipe( fpu_reg_reg );
4078 %}
4079 
4080 instruct Repl4I_zero(vecX dst, immI0 zero) %{
4081   predicate(n->as_Vector()->length() == 4);
4082   match(Set dst (ReplicateI zero));
4083   format %{ "pxor    $dst,$dst\t! replicate4I zero)" %}
4084   ins_encode %{
4085     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4086   %}
4087   ins_pipe( fpu_reg_reg );
4088 %}
4089 
4090 instruct Repl8I_zero(vecY dst, immI0 zero) %{
4091   predicate(n->as_Vector()->length() == 8);
4092   match(Set dst (ReplicateI zero));
4093   format %{ "vpxor   $dst,$dst,$dst\t! replicate8I zero" %}
4094   ins_encode %{
4095     // vpxor() falls back to vxorpd on plain AVX, since 256-bit integer vpxor is only available with AVX2.
4096     int vector_len = 1;
4097     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4098   %}
4099   ins_pipe( fpu_reg_reg );
4100 %}
4101 
4102 // Replicate long (8 byte) scalar to be vector
4103 #ifdef _LP64
4104 instruct Repl2L(vecX dst, rRegL src) %{
4105   predicate(n->as_Vector()->length() == 2);
4106   match(Set dst (ReplicateL src));
4107   format %{ "movdq   $dst,$src\n\t"
4108             "punpcklqdq $dst,$dst\t! replicate2L" %}
4109   ins_encode %{
4110     __ movdq($dst$$XMMRegister, $src$$Register);
4111     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4112   %}
4113   ins_pipe( pipe_slow );
4114 %}
4115 #else // _LP64
4116 instruct Repl2L(vecX dst, eRegL src, vecX tmp) %{
4117   predicate(n->as_Vector()->length() == 2);
4118   match(Set dst (ReplicateL src));
4119   effect(TEMP dst, USE src, TEMP tmp);
4120   format %{ "movdl   $dst,$src.lo\n\t"
4121             "movdl   $tmp,$src.hi\n\t"
4122             "punpckldq $dst,$tmp\n\t"
4123             "punpcklqdq $dst,$dst\t! replicate2L"%}
4124   ins_encode %{
4125     __ movdl($dst$$XMMRegister, $src$$Register);
4126     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4127     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4128     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4129   %}
4130   ins_pipe( pipe_slow );
4131 %}
4132 #endif // _LP64
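     // On 32-bit VMs (!_LP64) the long lives in a register pair, so the vector lane is
     // assembled by hand: movdl loads the low half, the high half goes through $tmp, and
     // punpckldq interleaves the two 32-bit halves into one 64-bit lane before it is
     // broadcast.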
4133 
4134 // Replicate long (8 byte) scalar immediate to be vector by loading from const table.
4135 instruct Repl2L_imm(vecX dst, immL con) %{
4136   predicate(n->as_Vector()->length() == 2);
4137   match(Set dst (ReplicateL con));
4138   format %{ "movq    $dst,[$constantaddress]\n\t"
4139             "punpcklqdq $dst,$dst\t! replicate2L($con)" %}
4140   ins_encode %{
4141     __ movq($dst$$XMMRegister, $constantaddress($con));
4142     __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
4143   %}
4144   ins_pipe( pipe_slow );
4145 %}
4146 
4147 // Replicate long (8 byte) scalar zero to be vector
4148 instruct Repl2L_zero(vecX dst, immL0 zero) %{
4149   predicate(n->as_Vector()->length() == 2);
4150   match(Set dst (ReplicateL zero));
4151   format %{ "pxor    $dst,$dst\t! replicate2L zero" %}
4152   ins_encode %{
4153     __ pxor($dst$$XMMRegister, $dst$$XMMRegister);
4154   %}
4155   ins_pipe( fpu_reg_reg );
4156 %}
4157 
4158 instruct Repl4L_zero(vecY dst, immL0 zero) %{
4159   predicate(n->as_Vector()->length() == 4);
4160   match(Set dst (ReplicateL zero));
4161   format %{ "vpxor   $dst,$dst,$dst\t! replicate4L zero" %}
4162   ins_encode %{
4163     // vpxor() falls back to vxorpd on plain AVX, since 256-bit integer vpxor is only available with AVX2.
4164     int vector_len = 1;
4165     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4166   %}
4167   ins_pipe( fpu_reg_reg );
4168 %}
4169 
4170 // Replicate float (4 byte) scalar to be vector
4171 instruct Repl2F(vecD dst, vlRegF src) %{
4172   predicate(n->as_Vector()->length() == 2);
4173   match(Set dst (ReplicateF src));
4174   format %{ "pshufd  $dst,$dst,0x00\t! replicate2F" %}
4175   ins_encode %{
4176     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4177   %}
4178   ins_pipe( fpu_reg_reg );
4179 %}
4180 
4181 instruct Repl4F(vecX dst, vlRegF src) %{
4182   predicate(n->as_Vector()->length() == 4);
4183   match(Set dst (ReplicateF src));
4184   format %{ "pshufd  $dst,$dst,0x00\t! replicate4F" %}
4185   ins_encode %{
4186     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
4187   %}
4188   ins_pipe( pipe_slow );
4189 %}
4190 
4191 // Replicate double (8 bytes) scalar to be vector
4192 instruct Repl2D(vecX dst, vlRegD src) %{
4193   predicate(n->as_Vector()->length() == 2);
4194   match(Set dst (ReplicateD src));
4195   format %{ "pshufd  $dst,$src,0x44\t! replicate2D" %}
4196   ins_encode %{
4197     __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
4198   %}
4199   ins_pipe( pipe_slow );
4200 %}
4201 
4202 // ====================EVEX REPLICATE=============================================
4203 
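     // In the EVEX forms the vector_len argument encodes the vector width:
     //   0 = 128-bit (XMM), 1 = 256-bit (YMM), 2 = 512-bit (ZMM).
     // supports_avx512vl() gates the 128/256-bit EVEX encodings, and the byte/word
     // broadcasts additionally require AVX512BW (supports_avx512vlbw() checks both).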
4204 instruct Repl4B_mem_evex(vecS dst, memory mem) %{
4205   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4206   match(Set dst (ReplicateB (LoadB mem)));
4207   format %{ "vpbroadcastb  $dst,$mem\t! replicate4B" %}
4208   ins_encode %{
4209     int vector_len = 0;
4210     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4211   %}
4212   ins_pipe( pipe_slow );
4213 %}
4214 
4215 instruct Repl8B_mem_evex(vecD dst, memory mem) %{
4216   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4217   match(Set dst (ReplicateB (LoadB mem)));
4218   format %{ "vpbroadcastb  $dst,$mem\t! replicate8B" %}
4219   ins_encode %{
4220     int vector_len = 0;
4221     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4222   %}
4223   ins_pipe( pipe_slow );
4224 %}
4225 
4226 instruct Repl16B_evex(vecX dst, rRegI src) %{
4227   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4228   match(Set dst (ReplicateB src));
4229   format %{ "evpbroadcastb $dst,$src\t! replicate16B" %}
4230   ins_encode %{
4231     int vector_len = 0;
4232     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4233   %}
4234   ins_pipe( pipe_slow );
4235 %}
4236 
4237 instruct Repl16B_mem_evex(vecX dst, memory mem) %{
4238   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4239   match(Set dst (ReplicateB (LoadB mem)));
4240   format %{ "vpbroadcastb  $dst,$mem\t! replicate16B" %}
4241   ins_encode %{
4242     int vector_len = 0;
4243     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4244   %}
4245   ins_pipe( pipe_slow );
4246 %}
4247 
4248 instruct Repl32B_evex(vecY dst, rRegI src) %{
4249   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4250   match(Set dst (ReplicateB src));
4251   format %{ "evpbroadcastb $dst,$src\t! replicate32B" %}
4252   ins_encode %{
4253     int vector_len = 1;
4254     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4255   %}
4256   ins_pipe( pipe_slow );
4257 %}
4258 
4259 instruct Repl32B_mem_evex(vecY dst, memory mem) %{
4260   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4261   match(Set dst (ReplicateB (LoadB mem)));
4262   format %{ "vpbroadcastb  $dst,$mem\t! replicate32B" %}
4263   ins_encode %{
4264     int vector_len = 1;
4265     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4266   %}
4267   ins_pipe( pipe_slow );
4268 %}
4269 
4270 instruct Repl64B_evex(vecZ dst, rRegI src) %{
4271   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4272   match(Set dst (ReplicateB src));
4273   format %{ "evpbroadcastb $dst,$src\t! upper replicate64B" %}
4274   ins_encode %{
4275     int vector_len = 2;
4276     __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len);
4277   %}
4278   ins_pipe( pipe_slow );
4279 %}
4280 
4281 instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
4282   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4283   match(Set dst (ReplicateB (LoadB mem)));
4284   format %{ "vpbroadcastb  $dst,$mem\t! replicate64B" %}
4285   ins_encode %{
4286     int vector_len = 2;
4287     __ vpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len);
4288   %}
4289   ins_pipe( pipe_slow );
4290 %}
4291 
4292 instruct Repl16B_imm_evex(vecX dst, immI con) %{
4293   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4294   match(Set dst (ReplicateB con));
4295   format %{ "movq    $dst,[$constantaddress]\n\t"
4296             "vpbroadcastb $dst,$dst\t! replicate16B" %}
4297   ins_encode %{
4298     int vector_len = 0;
4299     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4300     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4301   %}
4302   ins_pipe( pipe_slow );
4303 %}
4304 
4305 instruct Repl32B_imm_evex(vecY dst, immI con) %{
4306   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4307   match(Set dst (ReplicateB con));
4308   format %{ "movq    $dst,[$constantaddress]\n\t"
4309             "vpbroadcastb $dst,$dst\t! replicate32B" %}
4310   ins_encode %{
4311     int vector_len = 1;
4312     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4313     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4314   %}
4315   ins_pipe( pipe_slow );
4316 %}
4317 
4318 instruct Repl64B_imm_evex(vecZ dst, immI con) %{
4319   predicate(n->as_Vector()->length() == 64 && UseAVX > 2 && VM_Version::supports_avx512bw());
4320   match(Set dst (ReplicateB con));
4321   format %{ "movq    $dst,[$constantaddress]\n\t"
4322             "vpbroadcastb $dst,$dst\t! upper replicate64B" %}
4323   ins_encode %{
4324     int vector_len = 2;
4325     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
4326     __ vpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4327   %}
4328   ins_pipe( pipe_slow );
4329 %}
4330 
4331 instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
4332   predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
4333   match(Set dst (ReplicateB zero));
4334   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate64B zero" %}
4335   ins_encode %{
4336     // Plain AVX has no 512-bit vpxor; the UseAVX > 2 predicate guarantees the EVEX form is available.
4337     int vector_len = 2;
4338     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4339   %}
4340   ins_pipe( fpu_reg_reg );
4341 %}
4342 
4343 instruct Repl4S_evex(vecD dst, rRegI src) %{
4344   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4345   match(Set dst (ReplicateS src));
4346   format %{ "evpbroadcastw $dst,$src\t! replicate4S" %}
4347   ins_encode %{
4348     int vector_len = 0;
4349     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4350   %}
4351   ins_pipe( pipe_slow );
4352 %}
4353 
4354 instruct Repl4S_mem_evex(vecD dst, memory mem) %{
4355   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4356   match(Set dst (ReplicateS (LoadS mem)));
4357   format %{ "vpbroadcastw  $dst,$mem\t! replicate4S" %}
4358   ins_encode %{
4359     int vector_len = 0;
4360     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4361   %}
4362   ins_pipe( pipe_slow );
4363 %}
4364 
4365 instruct Repl8S_evex(vecX dst, rRegI src) %{
4366   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4367   match(Set dst (ReplicateS src));
4368   format %{ "evpbroadcastw $dst,$src\t! replicate8S" %}
4369   ins_encode %{
4370     int vector_len = 0;
4371     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4372   %}
4373   ins_pipe( pipe_slow );
4374 %}
4375 
4376 instruct Repl8S_mem_evex(vecX dst, memory mem) %{
4377   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4378   match(Set dst (ReplicateS (LoadS mem)));
4379   format %{ "vpbroadcastw  $dst,$mem\t! replicate8S" %}
4380   ins_encode %{
4381     int vector_len = 0;
4382     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4383   %}
4384   ins_pipe( pipe_slow );
4385 %}
4386 
4387 instruct Repl16S_evex(vecY dst, rRegI src) %{
4388   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4389   match(Set dst (ReplicateS src));
4390   format %{ "evpbroadcastw $dst,$src\t! replicate16S" %}
4391   ins_encode %{
4392     int vector_len = 1;
4393     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4394   %}
4395   ins_pipe( pipe_slow );
4396 %}
4397 
4398 instruct Repl16S_mem_evex(vecY dst, memory mem) %{
4399   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4400   match(Set dst (ReplicateS (LoadS mem)));
4401   format %{ "vpbroadcastw  $dst,$mem\t! replicate16S" %}
4402   ins_encode %{
4403     int vector_len = 1;
4404     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4405   %}
4406   ins_pipe( pipe_slow );
4407 %}
4408 
4409 instruct Repl32S_evex(vecZ dst, rRegI src) %{
4410   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4411   match(Set dst (ReplicateS src));
4412   format %{ "evpbroadcastw $dst,$src\t! replicate32S" %}
4413   ins_encode %{
4414     int vector_len = 2;
4415     __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len);
4416   %}
4417   ins_pipe( pipe_slow );
4418 %}
4419 
4420 instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
4421   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4422   match(Set dst (ReplicateS (LoadS mem)));
4423   format %{ "vpbroadcastw  $dst,$mem\t! replicate32S" %}
4424   ins_encode %{
4425     int vector_len = 2;
4426     __ vpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
4427   %}
4428   ins_pipe( pipe_slow );
4429 %}
4430 
4431 instruct Repl8S_imm_evex(vecX dst, immI con) %{
4432   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4433   match(Set dst (ReplicateS con));
4434   format %{ "movq    $dst,[$constantaddress]\n\t"
4435             "vpbroadcastw $dst,$dst\t! replicate8S" %}
4436   ins_encode %{
4437     int vector_len = 0;
4438     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4439     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4440   %}
4441   ins_pipe( pipe_slow );
4442 %}
4443 
4444 instruct Repl16S_imm_evex(vecY dst, immI con) %{
4445   predicate(n->as_Vector()->length() == 16 && UseAVX > 2 && VM_Version::supports_avx512vlbw());
4446   match(Set dst (ReplicateS con));
4447   format %{ "movq    $dst,[$constantaddress]\n\t"
4448             "vpbroadcastw $dst,$dst\t! replicate16S" %}
4449   ins_encode %{
4450     int vector_len = 1;
4451     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4452     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4453   %}
4454   ins_pipe( pipe_slow );
4455 %}
4456 
4457 instruct Repl32S_imm_evex(vecZ dst, immI con) %{
4458   predicate(n->as_Vector()->length() == 32 && UseAVX > 2 && VM_Version::supports_avx512bw());
4459   match(Set dst (ReplicateS con));
4460   format %{ "movq    $dst,[$constantaddress]\n\t"
4461             "vpbroadcastw $dst,$dst\t! replicate32S" %}
4462   ins_encode %{
4463     int vector_len = 2;
4464     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
4465     __ vpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4466   %}
4467   ins_pipe( pipe_slow );
4468 %}
4469 
4470 instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
4471   predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
4472   match(Set dst (ReplicateS zero));
4473   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate32S zero" %}
4474   ins_encode %{
4475     // Plain AVX has no 512-bit vpxor; the UseAVX > 2 predicate guarantees the EVEX form is available.
4476     int vector_len = 2;
4477     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4478   %}
4479   ins_pipe( fpu_reg_reg );
4480 %}
4481 
4482 instruct Repl4I_evex(vecX dst, rRegI src) %{
4483   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4484   match(Set dst (ReplicateI src));
4485   format %{ "evpbroadcastd  $dst,$src\t! replicate4I" %}
4486   ins_encode %{
4487     int vector_len = 0;
4488     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4489   %}
4490   ins_pipe( pipe_slow );
4491 %}
4492 
4493 instruct Repl4I_mem_evex(vecX dst, memory mem) %{
4494   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4495   match(Set dst (ReplicateI (LoadI mem)));
4496   format %{ "vpbroadcastd  $dst,$mem\t! replicate4I" %}
4497   ins_encode %{
4498     int vector_len = 0;
4499     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4500   %}
4501   ins_pipe( pipe_slow );
4502 %}
4503 
4504 instruct Repl8I_evex(vecY dst, rRegI src) %{
4505   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4506   match(Set dst (ReplicateI src));
4507   format %{ "evpbroadcastd  $dst,$src\t! replicate8I" %}
4508   ins_encode %{
4509     int vector_len = 1;
4510     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4511   %}
4512   ins_pipe( pipe_slow );
4513 %}
4514 
4515 instruct Repl8I_mem_evex(vecY dst, memory mem) %{
4516   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4517   match(Set dst (ReplicateI (LoadI mem)));
4518   format %{ "vpbroadcastd  $dst,$mem\t! replicate8I" %}
4519   ins_encode %{
4520     int vector_len = 1;
4521     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4522   %}
4523   ins_pipe( pipe_slow );
4524 %}
4525 
4526 instruct Repl16I_evex(vecZ dst, rRegI src) %{
4527   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4528   match(Set dst (ReplicateI src));
4529   format %{ "evpbroadcastd  $dst,$src\t! replicate16I" %}
4530   ins_encode %{
4531     int vector_len = 2;
4532     __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
4533   %}
4534   ins_pipe( pipe_slow );
4535 %}
4536 
4537 instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
4538   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4539   match(Set dst (ReplicateI (LoadI mem)));
4540   format %{ "vpbroadcastd  $dst,$mem\t! replicate16I" %}
4541   ins_encode %{
4542     int vector_len = 2;
4543     __ vpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
4544   %}
4545   ins_pipe( pipe_slow );
4546 %}
4547 
4548 instruct Repl4I_imm_evex(vecX dst, immI con) %{
4549   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4550   match(Set dst (ReplicateI con));
4551   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4552             "vpbroadcastd  $dst,$dst\t! replicate4I" %}
4553   ins_encode %{
4554     int vector_len = 0;
4555     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4556     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4557   %}
4558   ins_pipe( pipe_slow );
4559 %}
4560 
4561 instruct Repl8I_imm_evex(vecY dst, immI con) %{
4562   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4563   match(Set dst (ReplicateI con));
4564   format %{ "movq    $dst,[$constantaddress]\t! replicate8I($con)\n\t"
4565             "vpbroadcastd  $dst,$dst\t! replicate8I" %}
4566   ins_encode %{
4567     int vector_len = 1;
4568     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4569     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4570   %}
4571   ins_pipe( pipe_slow );
4572 %}
4573 
4574 instruct Repl16I_imm_evex(vecZ dst, immI con) %{
4575   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4576   match(Set dst (ReplicateI con));
4577   format %{ "movq    $dst,[$constantaddress]\t! replicate16I($con)\n\t"
4578             "vpbroadcastd  $dst,$dst\t! replicate16I" %}
4579   ins_encode %{
4580     int vector_len = 2;
4581     __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
4582     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4583   %}
4584   ins_pipe( pipe_slow );
4585 %}
4586 
4587 instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
4588   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4589   match(Set dst (ReplicateI zero));
4590   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate16I zero" %}
4591   ins_encode %{
4592     // Plain AVX has no 512-bit vpxor; the UseAVX > 2 predicate guarantees the EVEX form is available.
4593     int vector_len = 2;
4594     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4595   %}
4596   ins_pipe( fpu_reg_reg );
4597 %}
4598 
4599 // Replicate long (8 byte) scalar to be vector
4600 #ifdef _LP64
4601 instruct Repl4L_evex(vecY dst, rRegL src) %{
4602   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4603   match(Set dst (ReplicateL src));
4604   format %{ "evpbroadcastq  $dst,$src\t! replicate4L" %}
4605   ins_encode %{
4606     int vector_len = 1;
4607     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4608   %}
4609   ins_pipe( pipe_slow );
4610 %}
4611 
4612 instruct Repl8L_evex(vecZ dst, rRegL src) %{
4613   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4614   match(Set dst (ReplicateL src));
4615   format %{ "evpbroadcastq  $dst,$src\t! replicate8L" %}
4616   ins_encode %{
4617     int vector_len = 2;
4618     __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
4619   %}
4620   ins_pipe( pipe_slow );
4621 %}
4622 #else // _LP64
4623 instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
4624   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4625   match(Set dst (ReplicateL src));
4626   effect(TEMP dst, USE src, TEMP tmp);
4627   format %{ "movdl   $dst,$src.lo\n\t"
4628             "movdl   $tmp,$src.hi\n\t"
4629             "punpckldq $dst,$tmp\n\t"
4630             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4631   ins_encode %{
4632     int vector_len = 1;
4633     __ movdl($dst$$XMMRegister, $src$$Register);
4634     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4635     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4636     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4637   %}
4638   ins_pipe( pipe_slow );
4639 %}
4640 
4641 instruct Repl8L_evex(legVecZ dst, eRegL src, legVecZ tmp) %{
4642   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4643   match(Set dst (ReplicateL src));
4644   effect(TEMP dst, USE src, TEMP tmp);
4645   format %{ "movdl   $dst,$src.lo\n\t"
4646             "movdl   $tmp,$src.hi\n\t"
4647             "punpckldq $dst,$tmp\n\t"
4648             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4649   ins_encode %{
4650     int vector_len = 2;
4651     __ movdl($dst$$XMMRegister, $src$$Register);
4652     __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
4653     __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
4654     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4655   %}
4656   ins_pipe( pipe_slow );
4657 %}
4658 #endif // _LP64
4659 
4660 instruct Repl4L_imm_evex(vecY dst, immL con) %{
4661   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4662   match(Set dst (ReplicateL con));
4663   format %{ "movq    $dst,[$constantaddress]\n\t"
4664             "vpbroadcastq  $dst,$dst\t! replicate4L" %}
4665   ins_encode %{
4666     int vector_len = 1;
4667     __ movq($dst$$XMMRegister, $constantaddress($con));
4668     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4669   %}
4670   ins_pipe( pipe_slow );
4671 %}
4672 
4673 instruct Repl8L_imm_evex(vecZ dst, immL con) %{
4674   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4675   match(Set dst (ReplicateL con));
4676   format %{ "movq    $dst,[$constantaddress]\n\t"
4677             "vpbroadcastq  $dst,$dst\t! replicate8L" %}
4678   ins_encode %{
4679     int vector_len = 2;
4680     __ movq($dst$$XMMRegister, $constantaddress($con));
4681     __ vpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4682   %}
4683   ins_pipe( pipe_slow );
4684 %}
4685 
4686 instruct Repl2L_mem_evex(vecX dst, memory mem) %{
4687   predicate(n->as_Vector()->length() == 2 && UseAVX > 2 && VM_Version::supports_avx512vl());
4688   match(Set dst (ReplicateL (LoadL mem)));
4689   format %{ "vpbroadcastd  $dst,$mem\t! replicate2L" %}
4690   ins_encode %{
4691     int vector_len = 0;
4692     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4693   %}
4694   ins_pipe( pipe_slow );
4695 %}
4696 
4697 instruct Repl4L_mem_evex(vecY dst, memory mem) %{
4698   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4699   match(Set dst (ReplicateL (LoadL mem)));
4700   format %{ "vpbroadcastd  $dst,$mem\t! replicate4L" %}
4701   ins_encode %{
4702     int vector_len = 1;
4703     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4704   %}
4705   ins_pipe( pipe_slow );
4706 %}
4707 
4708 instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
4709   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4710   match(Set dst (ReplicateL (LoadL mem)));
4711   format %{ "vpbroadcastd  $dst,$mem\t! replicate8L" %}
4712   ins_encode %{
4713     int vector_len = 2;
4714     __ vpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len);
4715   %}
4716   ins_pipe( pipe_slow );
4717 %}
4718 
4719 instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
4720   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4721   match(Set dst (ReplicateL zero));
4722   format %{ "vpxor   $dst k0,$dst,$dst\t! replicate8L zero" %}
4723   ins_encode %{
4724     // Plain AVX has no 512-bit vpxor; the UseAVX > 2 predicate guarantees the EVEX form is available.
4725     int vector_len = 2;
4726     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4727   %}
4728   ins_pipe( fpu_reg_reg );
4729 %}
4730 
4731 instruct Repl8F_evex(vecY dst, regF src) %{
4732   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4733   match(Set dst (ReplicateF src));
4734   format %{ "vpbroadcastss $dst,$src\t! replicate8F" %}
4735   ins_encode %{
4736     int vector_len = 1;
4737     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4738   %}
4739   ins_pipe( pipe_slow );
4740 %}
4741 
4742 instruct Repl8F_mem_evex(vecY dst, memory mem) %{
4743   predicate(n->as_Vector()->length() == 8 && UseAVX > 2 && VM_Version::supports_avx512vl());
4744   match(Set dst (ReplicateF (LoadF mem)));
4745   format %{ "vbroadcastss  $dst,$mem\t! replicate8F" %}
4746   ins_encode %{
4747     int vector_len = 1;
4748     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4749   %}
4750   ins_pipe( pipe_slow );
4751 %}
4752 
4753 instruct Repl16F_evex(vecZ dst, regF src) %{
4754   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4755   match(Set dst (ReplicateF src));
4756   format %{ "vpbroadcastss $dst,$src\t! replicate16F" %}
4757   ins_encode %{
4758     int vector_len = 2;
4759     __ vpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4760   %}
4761   ins_pipe( pipe_slow );
4762 %}
4763 
4764 instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
4765   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4766   match(Set dst (ReplicateF (LoadF mem)));
4767   format %{ "vbroadcastss  $dst,$mem\t! replicate16F" %}
4768   ins_encode %{
4769     int vector_len = 2;
4770     __ vpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
4771   %}
4772   ins_pipe( pipe_slow );
4773 %}
4774 
4775 instruct Repl2F_zero_evex(vecD dst, immF0 zero) %{
4776   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4777   match(Set dst (ReplicateF zero));
4778   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2F zero" %}
4779   ins_encode %{
4780     // Use vpxor in place of vxorps: the EVEX-encoded vxorps requires AVX512DQ, and this is a 512-bit operation.
4781     int vector_len = 2;
4782     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4783   %}
4784   ins_pipe( fpu_reg_reg );
4785 %}
4786 
4787 instruct Repl4F_zero_evex(vecX dst, immF0 zero) %{
4788   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4789   match(Set dst (ReplicateF zero));
4790   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4F zero" %}
4791   ins_encode %{
4792     // Use vpxor in place of vxorps: the EVEX-encoded vxorps requires AVX512DQ, and this is a 512-bit operation.
4793     int vector_len = 2;
4794     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4795   %}
4796   ins_pipe( fpu_reg_reg );
4797 %}
4798 
4799 instruct Repl8F_zero_evex(vecY dst, immF0 zero) %{
4800   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4801   match(Set dst (ReplicateF zero));
4802   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate8F zero" %}
4803   ins_encode %{
4804     // Use vpxor in place of vxorps: the EVEX-encoded vxorps requires AVX512DQ, and this is a 512-bit operation.
4805     int vector_len = 2;
4806     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4807   %}
4808   ins_pipe( fpu_reg_reg );
4809 %}
4810 
4811 instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
4812   predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
4813   match(Set dst (ReplicateF zero));
4814   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate16F zero" %}
4815   ins_encode %{
4816     // Use vpxor in place of vxorps: the EVEX-encoded vxorps requires AVX512DQ, and this is a 512-bit operation.
4817     int vector_len = 2;
4818     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4819   %}
4820   ins_pipe( fpu_reg_reg );
4821 %}
4822 
4823 instruct Repl4D_evex(vecY dst, regD src) %{
4824   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4825   match(Set dst (ReplicateD src));
4826   format %{ "vpbroadcastsd $dst,$src\t! replicate4D" %}
4827   ins_encode %{
4828     int vector_len = 1;
4829     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4830   %}
4831   ins_pipe( pipe_slow );
4832 %}
4833 
4834 instruct Repl4D_mem_evex(vecY dst, memory mem) %{
4835   predicate(n->as_Vector()->length() == 4 && UseAVX > 2 && VM_Version::supports_avx512vl());
4836   match(Set dst (ReplicateD (LoadD mem)));
4837   format %{ "vbroadcastsd  $dst,$mem\t! replicate4D" %}
4838   ins_encode %{
4839     int vector_len = 1;
4840     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4841   %}
4842   ins_pipe( pipe_slow );
4843 %}
4844 
4845 instruct Repl8D_evex(vecZ dst, regD src) %{
4846   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4847   match(Set dst (ReplicateD src));
4848   format %{ "vpbroadcastsd $dst,$src\t! replicate8D" %}
4849   ins_encode %{
4850     int vector_len = 2;
4851     __ vpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
4852   %}
4853   ins_pipe( pipe_slow );
4854 %}
4855 
4856 instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
4857   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4858   match(Set dst (ReplicateD (LoadD mem)));
4859   format %{ "vbroadcastsd  $dst,$mem\t! replicate8D" %}
4860   ins_encode %{
4861     int vector_len = 2;
4862     __ vpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
4863   %}
4864   ins_pipe( pipe_slow );
4865 %}
4866 
4867 instruct Repl2D_zero_evex(vecX dst, immD0 zero) %{
4868   predicate(n->as_Vector()->length() == 2 && UseAVX > 2);
4869   match(Set dst (ReplicateD zero));
4870   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate2D zero" %}
4871   ins_encode %{
4872     // Use vpxor in place of vxorpd: the EVEX-encoded vxorpd requires AVX512DQ, and this is a 512-bit operation.
4873     int vector_len = 2;
4874     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4875   %}
4876   ins_pipe( fpu_reg_reg );
4877 %}
4878 
4879 instruct Repl4D_zero_evex(vecY dst, immD0 zero) %{
4880   predicate(n->as_Vector()->length() == 4 && UseAVX > 2);
4881   match(Set dst (ReplicateD zero));
4882   format %{ "vpxor  $dst k0,$dst,$dst\t! replicate4D zero" %}
4883   ins_encode %{
4884     // Use vpxor in place of vxorpd: the EVEX-encoded vxorpd requires AVX512DQ, and this is a 512-bit operation.
4885     int vector_len = 2;
4886     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4887   %}
4888   ins_pipe( fpu_reg_reg );
4889 %}
4890 
4891 instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
4892   predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
4893   match(Set dst (ReplicateD zero));
4894   format %{ "vpxor  $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
4895   ins_encode %{
4896     // Use vpxor in place of vxorpd: the EVEX-encoded vxorpd requires AVX512DQ, and this is a 512-bit operation.
4897     int vector_len = 2;
4898     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
4899   %}
4900   ins_pipe( fpu_reg_reg );
4901 %}
4902 
4903 // ====================REDUCTION ARITHMETIC=======================================
4904 
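     // The add reductions below fold the vector operand src2 down to a single lane by
     // repeatedly adding its upper half to its lower half (vextract*/pshufd followed by a
     // packed add), then add the scalar src1 and move the result back to a general register.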
4905 instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4906   predicate(UseSSE > 2 && UseAVX == 0);
4907   match(Set dst (AddReductionVI src1 src2));
4908   effect(TEMP tmp2, TEMP tmp);
4909   format %{ "movdqu  $tmp2,$src2\n\t"
4910             "phaddd  $tmp2,$tmp2\n\t"
4911             "movd    $tmp,$src1\n\t"
4912             "paddd   $tmp,$tmp2\n\t"
4913             "movd    $dst,$tmp\t! add reduction2I" %}
4914   ins_encode %{
4915     __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
4916     __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
4917     __ movdl($tmp$$XMMRegister, $src1$$Register);
4918     __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
4919     __ movdl($dst$$Register, $tmp$$XMMRegister);
4920   %}
4921   ins_pipe( pipe_slow );
4922 %}
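     // phaddd with dst == src leaves dst[0] = dst[0] + dst[1] (the pair sums are mirrored
     // into the other lanes), so a single pass reduces two lanes; the 4I form below needs
     // two passes before the scalar add.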
4923 
4924 instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4925   predicate(VM_Version::supports_avxonly());
4926   match(Set dst (AddReductionVI src1 src2));
4927   effect(TEMP tmp, TEMP tmp2);
4928   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4929             "movd     $tmp2,$src1\n\t"
4930             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4931             "movd     $dst,$tmp2\t! add reduction2I" %}
4932   ins_encode %{
4933     int vector_len = 0;
4934     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4935     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4936     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4937     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4938   %}
4939   ins_pipe( pipe_slow );
4940 %}
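     // supports_avxonly() selects machines with AVX/AVX2 but without AVX-512; those forms
     // use vphaddd, which has no EVEX encoding, while the *_evex forms below reduce with
     // shuffles plus vpaddd instead.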
4941 
4942 instruct rvadd2I_reduction_reg_evex(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
4943   predicate(UseAVX > 2);
4944   match(Set dst (AddReductionVI src1 src2));
4945   effect(TEMP tmp, TEMP tmp2);
4946   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
4947             "vpaddd  $tmp,$src2,$tmp2\n\t"
4948             "movd    $tmp2,$src1\n\t"
4949             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
4950             "movd    $dst,$tmp2\t! add reduction2I" %}
4951   ins_encode %{
4952     int vector_len = 0;
4953     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
4954     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4955     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4956     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
4957     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4958   %}
4959   ins_pipe( pipe_slow );
4960 %}
4961 
4962 instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
4963   predicate(UseSSE > 2 && UseAVX == 0);
4964   match(Set dst (AddReductionVI src1 src2));
4965   effect(TEMP tmp, TEMP tmp2);
4966   format %{ "movdqu  $tmp,$src2\n\t"
4967             "phaddd  $tmp,$tmp\n\t"
4968             "phaddd  $tmp,$tmp\n\t"
4969             "movd    $tmp2,$src1\n\t"
4970             "paddd   $tmp2,$tmp\n\t"
4971             "movd    $dst,$tmp2\t! add reduction4I" %}
4972   ins_encode %{
4973     __ movdqu($tmp$$XMMRegister, $src2$$XMMRegister);
4974     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4975     __ phaddd($tmp$$XMMRegister, $tmp$$XMMRegister);
4976     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4977     __ paddd($tmp2$$XMMRegister, $tmp$$XMMRegister);
4978     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4979   %}
4980   ins_pipe( pipe_slow );
4981 %}
4982 
4983 instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
4984   predicate(VM_Version::supports_avxonly());
4985   match(Set dst (AddReductionVI src1 src2));
4986   effect(TEMP tmp, TEMP tmp2);
4987   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
4988             "vphaddd  $tmp,$tmp,$tmp\n\t"
4989             "movd     $tmp2,$src1\n\t"
4990             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
4991             "movd     $dst,$tmp2\t! add reduction4I" %}
4992   ins_encode %{
4993     int vector_len = 0;
4994     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
4995     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
4996     __ movdl($tmp2$$XMMRegister, $src1$$Register);
4997     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, vector_len);
4998     __ movdl($dst$$Register, $tmp2$$XMMRegister);
4999   %}
5000   ins_pipe( pipe_slow );
5001 %}
5002 
5003 instruct rvadd4I_reduction_reg_evex(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5004   predicate(UseAVX > 2);
5005   match(Set dst (AddReductionVI src1 src2));
5006   effect(TEMP tmp, TEMP tmp2);
5007   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5008             "vpaddd  $tmp,$src2,$tmp2\n\t"
5009             "pshufd  $tmp2,$tmp,0x1\n\t"
5010             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5011             "movd    $tmp2,$src1\n\t"
5012             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5013             "movd    $dst,$tmp2\t! add reduction4I" %}
5014   ins_encode %{
5015     int vector_len = 0;
5016     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5017     __ vpaddd($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5018     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5019     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5020     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5021     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5022     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5023   %}
5024   ins_pipe( pipe_slow );
5025 %}
5026 
5027 instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5028   predicate(VM_Version::supports_avxonly());
5029   match(Set dst (AddReductionVI src1 src2));
5030   effect(TEMP tmp, TEMP tmp2);
5031   format %{ "vphaddd  $tmp,$src2,$src2\n\t"
5032             "vphaddd  $tmp,$tmp,$tmp2\n\t"
5033             "vextracti128_high  $tmp2,$tmp\n\t"
5034             "vpaddd   $tmp,$tmp,$tmp2\n\t"
5035             "movd     $tmp2,$src1\n\t"
5036             "vpaddd   $tmp2,$tmp2,$tmp\n\t"
5037             "movd     $dst,$tmp2\t! add reduction8I" %}
5038   ins_encode %{
5039     int vector_len = 1;
5040     __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, vector_len);
5041     __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5042     __ vextracti128_high($tmp2$$XMMRegister, $tmp$$XMMRegister);
5043     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5044     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5045     __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5046     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5047   %}
5048   ins_pipe( pipe_slow );
5049 %}
5050 
5051 instruct rvadd8I_reduction_reg_evex(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5052   predicate(UseAVX > 2);
5053   match(Set dst (AddReductionVI src1 src2));
5054   effect(TEMP tmp, TEMP tmp2);
5055   format %{ "vextracti128_high  $tmp,$src2\n\t"
5056             "vpaddd  $tmp,$tmp,$src2\n\t"
5057             "pshufd  $tmp2,$tmp,0xE\n\t"
5058             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5059             "pshufd  $tmp2,$tmp,0x1\n\t"
5060             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5061             "movd    $tmp2,$src1\n\t"
5062             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5063             "movd    $dst,$tmp2\t! add reduction8I" %}
5064   ins_encode %{
5065     int vector_len = 0;
5066     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5067     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5068     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5069     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5070     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5071     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5072     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5073     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5074     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5075   %}
5076   ins_pipe( pipe_slow );
5077 %}
5078 
5079 instruct rvadd16I_reduction_reg_evex(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5080   predicate(UseAVX > 2);
5081   match(Set dst (AddReductionVI src1 src2));
5082   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5083   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5084             "vpaddd  $tmp3,$tmp3,$src2\n\t"
5085             "vextracti128_high  $tmp,$tmp3\n\t"
5086             "vpaddd  $tmp,$tmp,$tmp3\n\t"
5087             "pshufd  $tmp2,$tmp,0xE\n\t"
5088             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5089             "pshufd  $tmp2,$tmp,0x1\n\t"
5090             "vpaddd  $tmp,$tmp,$tmp2\n\t"
5091             "movd    $tmp2,$src1\n\t"
5092             "vpaddd  $tmp2,$tmp,$tmp2\n\t"
5093             "movd    $dst,$tmp2\t! add reduction16I" %}
5094   ins_encode %{
5095     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5096     __ vpaddd($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5097     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5098     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5099     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5100     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5101     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5102     __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5103     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5104     __ vpaddd($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5105     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5106   %}
5107   ins_pipe( pipe_slow );
5108 %}
5109 
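// Long (AddReductionVL) reductions are LP64-only: the running value is kept in a
// single 64-bit general register (rRegL) and moved to and from XMM with movdq,
// which relies on 64-bit general registers and is not available in 32-bit mode.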
5110 #ifdef _LP64
5111 instruct rvadd2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5112   predicate(UseAVX > 2);
5113   match(Set dst (AddReductionVL src1 src2));
5114   effect(TEMP tmp, TEMP tmp2);
5115   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5116             "vpaddq  $tmp,$src2,$tmp2\n\t"
5117             "movdq   $tmp2,$src1\n\t"
5118             "vpaddq  $tmp2,$tmp,$tmp2\n\t"
5119             "movdq   $dst,$tmp2\t! add reduction2L" %}
5120   ins_encode %{
5121     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5122     __ vpaddq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5123     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5124     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5125     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5126   %}
5127   ins_pipe( pipe_slow );
5128 %}
5129 
5130 instruct rvadd4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5131   predicate(UseAVX > 2);
5132   match(Set dst (AddReductionVL src1 src2));
5133   effect(TEMP tmp, TEMP tmp2);
5134   format %{ "vextracti128_high  $tmp,$src2\n\t"
5135             "vpaddq  $tmp2,$tmp,$src2\n\t"
5136             "pshufd  $tmp,$tmp2,0xE\n\t"
5137             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5138             "movdq   $tmp,$src1\n\t"
5139             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5140             "movdq   $dst,$tmp2\t! add reduction4L" %}
5141   ins_encode %{
5142     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5143     __ vpaddq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5144     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5145     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5146     __ movdq($tmp$$XMMRegister, $src1$$Register);
5147     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5148     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5149   %}
5150   ins_pipe( pipe_slow );
5151 %}
5152 
5153 instruct rvadd8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5154   predicate(UseAVX > 2);
5155   match(Set dst (AddReductionVL src1 src2));
5156   effect(TEMP tmp, TEMP tmp2);
5157   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5158             "vpaddq  $tmp2,$tmp2,$src2\n\t"
5159             "vextracti128_high  $tmp,$tmp2\n\t"
5160             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5161             "pshufd  $tmp,$tmp2,0xE\n\t"
5162             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5163             "movdq   $tmp,$src1\n\t"
5164             "vpaddq  $tmp2,$tmp2,$tmp\n\t"
5165             "movdq   $dst,$tmp2\t! add reduction8L" %}
5166   ins_encode %{
5167     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5168     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5169     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5170     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5171     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5172     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5173     __ movdq($tmp$$XMMRegister, $src1$$Register);
5174     __ vpaddq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5175     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5176   %}
5177   ins_pipe( pipe_slow );
5178 %}
5179 #endif
5180 
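// Float/double add reductions accumulate the lanes one at a time with scalar
// addss/addsd, shuffling each lane into position with pshufd (and extracting the
// upper 128-bit lanes where needed).  The strictly ordered scalar chain preserves
// the left-to-right evaluation order that non-associative FP addition requires.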
5181 instruct rsadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5182   predicate(UseSSE >= 1 && UseAVX == 0);
5183   match(Set dst (AddReductionVF dst src2));
5184   effect(TEMP dst, TEMP tmp);
5185   format %{ "addss   $dst,$src2\n\t"
5186             "pshufd  $tmp,$src2,0x01\n\t"
5187             "addss   $dst,$tmp\t! add reduction2F" %}
5188   ins_encode %{
5189     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5190     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5191     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5192   %}
5193   ins_pipe( pipe_slow );
5194 %}
5195 
5196 instruct rvadd2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5197   predicate(UseAVX > 0);
5198   match(Set dst (AddReductionVF dst src2));
5199   effect(TEMP dst, TEMP tmp);
5200   format %{ "vaddss  $dst,$dst,$src2\n\t"
5201             "pshufd  $tmp,$src2,0x01\n\t"
5202             "vaddss  $dst,$dst,$tmp\t! add reduction2F" %}
5203   ins_encode %{
5204     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5205     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5206     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5207   %}
5208   ins_pipe( pipe_slow );
5209 %}
5210 
5211 instruct rsadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5212   predicate(UseSSE >= 1 && UseAVX == 0);
5213   match(Set dst (AddReductionVF dst src2));
5214   effect(TEMP dst, TEMP tmp);
5215   format %{ "addss   $dst,$src2\n\t"
5216             "pshufd  $tmp,$src2,0x01\n\t"
5217             "addss   $dst,$tmp\n\t"
5218             "pshufd  $tmp,$src2,0x02\n\t"
5219             "addss   $dst,$tmp\n\t"
5220             "pshufd  $tmp,$src2,0x03\n\t"
5221             "addss   $dst,$tmp\t! add reduction4F" %}
5222   ins_encode %{
5223     __ addss($dst$$XMMRegister, $src2$$XMMRegister);
5224     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5225     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5226     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5227     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5228     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5229     __ addss($dst$$XMMRegister, $tmp$$XMMRegister);
5230   %}
5231   ins_pipe( pipe_slow );
5232 %}
5233 
5234 instruct rvadd4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5235   predicate(UseAVX > 0);
5236   match(Set dst (AddReductionVF dst src2));
5237   effect(TEMP tmp, TEMP dst);
5238   format %{ "vaddss  $dst,$dst,$src2\n\t"
5239             "pshufd  $tmp,$src2,0x01\n\t"
5240             "vaddss  $dst,$dst,$tmp\n\t"
5241             "pshufd  $tmp,$src2,0x02\n\t"
5242             "vaddss  $dst,$dst,$tmp\n\t"
5243             "pshufd  $tmp,$src2,0x03\n\t"
5244             "vaddss  $dst,$dst,$tmp\t! add reduction4F" %}
5245   ins_encode %{
5246     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5247     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5248     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5249     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5250     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5251     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5252     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5253   %}
5254   ins_pipe( pipe_slow );
5255 %}
5256 
5257 instruct radd8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5258   predicate(UseAVX > 0);
5259   match(Set dst (AddReductionVF dst src2));
5260   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5261   format %{ "vaddss  $dst,$dst,$src2\n\t"
5262             "pshufd  $tmp,$src2,0x01\n\t"
5263             "vaddss  $dst,$dst,$tmp\n\t"
5264             "pshufd  $tmp,$src2,0x02\n\t"
5265             "vaddss  $dst,$dst,$tmp\n\t"
5266             "pshufd  $tmp,$src2,0x03\n\t"
5267             "vaddss  $dst,$dst,$tmp\n\t"
5268             "vextractf128_high  $tmp2,$src2\n\t"
5269             "vaddss  $dst,$dst,$tmp2\n\t"
5270             "pshufd  $tmp,$tmp2,0x01\n\t"
5271             "vaddss  $dst,$dst,$tmp\n\t"
5272             "pshufd  $tmp,$tmp2,0x02\n\t"
5273             "vaddss  $dst,$dst,$tmp\n\t"
5274             "pshufd  $tmp,$tmp2,0x03\n\t"
5275             "vaddss  $dst,$dst,$tmp\t! add reduction8F" %}
5276   ins_encode %{
5277     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5278     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5279     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5280     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5281     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5282     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5283     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5284     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5285     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5286     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5287     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5288     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5289     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5290     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5291     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5292   %}
5293   ins_pipe( pipe_slow );
5294 %}
5295 
5296 instruct radd16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5297   predicate(UseAVX > 2);
5298   match(Set dst (AddReductionVF dst src2));
5299   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5300   format %{ "vaddss  $dst,$dst,$src2\n\t"
5301             "pshufd  $tmp,$src2,0x01\n\t"
5302             "vaddss  $dst,$dst,$tmp\n\t"
5303             "pshufd  $tmp,$src2,0x02\n\t"
5304             "vaddss  $dst,$dst,$tmp\n\t"
5305             "pshufd  $tmp,$src2,0x03\n\t"
5306             "vaddss  $dst,$dst,$tmp\n\t"
5307             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5308             "vaddss  $dst,$dst,$tmp2\n\t"
5309             "pshufd  $tmp,$tmp2,0x01\n\t"
5310             "vaddss  $dst,$dst,$tmp\n\t"
5311             "pshufd  $tmp,$tmp2,0x02\n\t"
5312             "vaddss  $dst,$dst,$tmp\n\t"
5313             "pshufd  $tmp,$tmp2,0x03\n\t"
5314             "vaddss  $dst,$dst,$tmp\n\t"
5315             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5316             "vaddss  $dst,$dst,$tmp2\n\t"
5317             "pshufd  $tmp,$tmp2,0x01\n\t"
5318             "vaddss  $dst,$dst,$tmp\n\t"
5319             "pshufd  $tmp,$tmp2,0x02\n\t"
5320             "vaddss  $dst,$dst,$tmp\n\t"
5321             "pshufd  $tmp,$tmp2,0x03\n\t"
5322             "vaddss  $dst,$dst,$tmp\n\t"
5323             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5324             "vaddss  $dst,$dst,$tmp2\n\t"
5325             "pshufd  $tmp,$tmp2,0x01\n\t"
5326             "vaddss  $dst,$dst,$tmp\n\t"
5327             "pshufd  $tmp,$tmp2,0x02\n\t"
5328             "vaddss  $dst,$dst,$tmp\n\t"
5329             "pshufd  $tmp,$tmp2,0x03\n\t"
5330             "vaddss  $dst,$dst,$tmp\t! add reduction16F" %}
5331   ins_encode %{
5332     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5333     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5334     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5335     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5336     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5337     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5338     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5339     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5340     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5341     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5342     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5343     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5344     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5345     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5346     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5347     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5348     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5349     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5350     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5351     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5352     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5353     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5354     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5355     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5356     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5357     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5358     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5359     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5360     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5361     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5362     __ vaddss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5363   %}
5364   ins_pipe( pipe_slow );
5365 %}
5366 
5367 instruct rsadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5368   predicate(UseSSE >= 1 && UseAVX == 0);
5369   match(Set dst (AddReductionVD dst src2));
5370   effect(TEMP tmp, TEMP dst);
5371   format %{ "addsd   $dst,$src2\n\t"
5372             "pshufd  $tmp,$src2,0xE\n\t"
5373             "addsd   $dst,$tmp\t! add reduction2D" %}
5374   ins_encode %{
5375     __ addsd($dst$$XMMRegister, $src2$$XMMRegister);
5376     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5377     __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
5378   %}
5379   ins_pipe( pipe_slow );
5380 %}
5381 
5382 instruct rvadd2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5383   predicate(UseAVX > 0);
5384   match(Set dst (AddReductionVD dst src2));
5385   effect(TEMP tmp, TEMP dst);
5386   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5387             "pshufd  $tmp,$src2,0xE\n\t"
5388             "vaddsd  $dst,$dst,$tmp\t! add reduction2D" %}
5389   ins_encode %{
5390     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5391     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5392     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5393   %}
5394   ins_pipe( pipe_slow );
5395 %}
5396 
5397 instruct rvadd4D_reduction_reg(regD dst, vecY src2, vecX tmp, vecX tmp2) %{
5398   predicate(UseAVX > 0);
5399   match(Set dst (AddReductionVD dst src2));
5400   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5401   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5402             "pshufd  $tmp,$src2,0xE\n\t"
5403             "vaddsd  $dst,$dst,$tmp\n\t"
5404             "vextractf128  $tmp2,$src2,0x1\n\t"
5405             "vaddsd  $dst,$dst,$tmp2\n\t"
5406             "pshufd  $tmp,$tmp2,0xE\n\t"
5407             "vaddsd  $dst,$dst,$tmp\t! add reduction4D" %}
5408   ins_encode %{
5409     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5410     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5411     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5412     __ vextractf128($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5413     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5414     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5415     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5416   %}
5417   ins_pipe( pipe_slow );
5418 %}
5419 
5420 instruct rvadd8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5421   predicate(UseAVX > 2);
5422   match(Set dst (AddReductionVD dst src2));
5423   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5424   format %{ "vaddsd  $dst,$dst,$src2\n\t"
5425             "pshufd  $tmp,$src2,0xE\n\t"
5426             "vaddsd  $dst,$dst,$tmp\n\t"
5427             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5428             "vaddsd  $dst,$dst,$tmp2\n\t"
5429             "pshufd  $tmp,$tmp2,0xE\n\t"
5430             "vaddsd  $dst,$dst,$tmp\n\t"
5431             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5432             "vaddsd  $dst,$dst,$tmp2\n\t"
5433             "pshufd  $tmp,$tmp2,0xE\n\t"
5434             "vaddsd  $dst,$dst,$tmp\n\t"
5435             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5436             "vaddsd  $dst,$dst,$tmp2\n\t"
5437             "pshufd  $tmp,$tmp2,0xE\n\t"
5438             "vaddsd  $dst,$dst,$tmp\t! add reduction8D" %}
5439   ins_encode %{
5440     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5441     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5442     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5443     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5444     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5445     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5446     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5447     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5448     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5449     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5450     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5451     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5452     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5453     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5454     __ vaddsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5455   %}
5456   ins_pipe( pipe_slow );
5457 %}
5458 
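// Integer multiply reductions mirror the add reductions above, with packed
// multiplies (pmulld/vpmulld) in place of the packed adds.  pmulld is an SSE4.1
// instruction, hence the UseSSE > 3 predicate on the non-AVX forms.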
5459 instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5460   predicate(UseSSE > 3 && UseAVX == 0);
5461   match(Set dst (MulReductionVI src1 src2));
5462   effect(TEMP tmp, TEMP tmp2);
5463   format %{ "pshufd  $tmp2,$src2,0x1\n\t"
5464             "pmulld  $tmp2,$src2\n\t"
5465             "movd    $tmp,$src1\n\t"
5466             "pmulld  $tmp2,$tmp\n\t"
5467             "movd    $dst,$tmp2\t! mul reduction2I" %}
5468   ins_encode %{
5469     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5470     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5471     __ movdl($tmp$$XMMRegister, $src1$$Register);
5472     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5473     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5474   %}
5475   ins_pipe( pipe_slow );
5476 %}
5477 
5478 instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, vecD tmp, vecD tmp2) %{
5479   predicate(UseAVX > 0);
5480   match(Set dst (MulReductionVI src1 src2));
5481   effect(TEMP tmp, TEMP tmp2);
5482   format %{ "pshufd   $tmp2,$src2,0x1\n\t"
5483             "vpmulld  $tmp,$src2,$tmp2\n\t"
5484             "movd     $tmp2,$src1\n\t"
5485             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5486             "movd     $dst,$tmp2\t! mul reduction2I" %}
5487   ins_encode %{
5488     int vector_len = 0;
5489     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5490     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5491     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5492     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5493     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5494   %}
5495   ins_pipe( pipe_slow );
5496 %}
5497 
5498 instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5499   predicate(UseSSE > 3 && UseAVX == 0);
5500   match(Set dst (MulReductionVI src1 src2));
5501   effect(TEMP tmp, TEMP tmp2);
5502   format %{ "pshufd  $tmp2,$src2,0xE\n\t"
5503             "pmulld  $tmp2,$src2\n\t"
5504             "pshufd  $tmp,$tmp2,0x1\n\t"
5505             "pmulld  $tmp2,$tmp\n\t"
5506             "movd    $tmp,$src1\n\t"
5507             "pmulld  $tmp2,$tmp\n\t"
5508             "movd    $dst,$tmp2\t! mul reduction4I" %}
5509   ins_encode %{
5510     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5511     __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
5512     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
5513     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5514     __ movdl($tmp$$XMMRegister, $src1$$Register);
5515     __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
5516     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5517   %}
5518   ins_pipe( pipe_slow );
5519 %}
5520 
5521 instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, vecX tmp, vecX tmp2) %{
5522   predicate(UseAVX > 0);
5523   match(Set dst (MulReductionVI src1 src2));
5524   effect(TEMP tmp, TEMP tmp2);
5525   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5526             "vpmulld  $tmp,$src2,$tmp2\n\t"
5527             "pshufd   $tmp2,$tmp,0x1\n\t"
5528             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5529             "movd     $tmp2,$src1\n\t"
5530             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5531             "movd     $dst,$tmp2\t! mul reduction4I" %}
5532   ins_encode %{
5533     int vector_len = 0;
5534     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5535     __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5536     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5537     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5538     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5539     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5540     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5541   %}
5542   ins_pipe( pipe_slow );
5543 %}
5544 
5545 instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, vecY tmp, vecY tmp2) %{
5546   predicate(UseAVX > 1);
5547   match(Set dst (MulReductionVI src1 src2));
5548   effect(TEMP tmp, TEMP tmp2);
5549   format %{ "vextracti128_high  $tmp,$src2\n\t"
5550             "vpmulld  $tmp,$tmp,$src2\n\t"
5551             "pshufd   $tmp2,$tmp,0xE\n\t"
5552             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5553             "pshufd   $tmp2,$tmp,0x1\n\t"
5554             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5555             "movd     $tmp2,$src1\n\t"
5556             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5557             "movd     $dst,$tmp2\t! mul reduction8I" %}
5558   ins_encode %{
5559     int vector_len = 0;
5560     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5561     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, vector_len);
5562     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5563     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5564     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5565     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5566     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5567     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, vector_len);
5568     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5569   %}
5570   ins_pipe( pipe_slow );
5571 %}
5572 
5573 instruct rvmul16I_reduction_reg(rRegI dst, rRegI src1, legVecZ src2, legVecZ tmp, legVecZ tmp2, legVecZ tmp3) %{
5574   predicate(UseAVX > 2);
5575   match(Set dst (MulReductionVI src1 src2));
5576   effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
5577   format %{ "vextracti64x4_high  $tmp3,$src2\n\t"
5578             "vpmulld  $tmp3,$tmp3,$src2\n\t"
5579             "vextracti128_high  $tmp,$tmp3\n\t"
5580             "vpmulld  $tmp,$tmp,$tmp3\n\t"
5581             "pshufd   $tmp2,$tmp,0xE\n\t"
5582             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5583             "pshufd   $tmp2,$tmp,0x1\n\t"
5584             "vpmulld  $tmp,$tmp,$tmp2\n\t"
5585             "movd     $tmp2,$src1\n\t"
5586             "vpmulld  $tmp2,$tmp,$tmp2\n\t"
5587             "movd     $dst,$tmp2\t! mul reduction16I" %}
5588   ins_encode %{
5589     __ vextracti64x4_high($tmp3$$XMMRegister, $src2$$XMMRegister);
5590     __ vpmulld($tmp3$$XMMRegister, $tmp3$$XMMRegister, $src2$$XMMRegister, 1);
5591     __ vextracti128_high($tmp$$XMMRegister, $tmp3$$XMMRegister);
5592     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp3$$XMMRegister, 0);
5593     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
5594     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5595     __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
5596     __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5597     __ movdl($tmp2$$XMMRegister, $src1$$Register);
5598     __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5599     __ movdl($dst$$Register, $tmp2$$XMMRegister);
5600   %}
5601   ins_pipe( pipe_slow );
5602 %}
5603 
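// 64-bit element multiplies use vpmullq, an AVX-512DQ instruction, hence the
// VM_Version::supports_avx512dq() predicates below.  As with the long add
// reductions, these rules are LP64-only because of the movdq GPR<->XMM moves.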
5604 #ifdef _LP64
5605 instruct rvmul2L_reduction_reg(rRegL dst, rRegL src1, vecX src2, vecX tmp, vecX tmp2) %{
5606   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5607   match(Set dst (MulReductionVL src1 src2));
5608   effect(TEMP tmp, TEMP tmp2);
5609   format %{ "pshufd   $tmp2,$src2,0xE\n\t"
5610             "vpmullq  $tmp,$src2,$tmp2\n\t"
5611             "movdq    $tmp2,$src1\n\t"
5612             "vpmullq  $tmp2,$tmp,$tmp2\n\t"
5613             "movdq    $dst,$tmp2\t! mul reduction2L" %}
5614   ins_encode %{
5615     __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
5616     __ vpmullq($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, 0);
5617     __ movdq($tmp2$$XMMRegister, $src1$$Register);
5618     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, 0);
5619     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5620   %}
5621   ins_pipe( pipe_slow );
5622 %}
5623 
5624 instruct rvmul4L_reduction_reg(rRegL dst, rRegL src1, vecY src2, vecY tmp, vecY tmp2) %{
5625   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5626   match(Set dst (MulReductionVL src1 src2));
5627   effect(TEMP tmp, TEMP tmp2);
5628   format %{ "vextracti128_high  $tmp,$src2\n\t"
5629             "vpmullq  $tmp2,$tmp,$src2\n\t"
5630             "pshufd   $tmp,$tmp2,0xE\n\t"
5631             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5632             "movdq    $tmp,$src1\n\t"
5633             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5634             "movdq    $dst,$tmp2\t! mul reduction4L" %}
5635   ins_encode %{
5636     __ vextracti128_high($tmp$$XMMRegister, $src2$$XMMRegister);
5637     __ vpmullq($tmp2$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, 0);
5638     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5639     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5640     __ movdq($tmp$$XMMRegister, $src1$$Register);
5641     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5642     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5643   %}
5644   ins_pipe( pipe_slow );
5645 %}
5646 
5647 instruct rvmul8L_reduction_reg(rRegL dst, rRegL src1, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5648   predicate(UseAVX > 2 && VM_Version::supports_avx512dq());
5649   match(Set dst (MulReductionVL src1 src2));
5650   effect(TEMP tmp, TEMP tmp2);
5651   format %{ "vextracti64x4_high  $tmp2,$src2\n\t"
5652             "vpmullq  $tmp2,$tmp2,$src2\n\t"
5653             "vextracti128_high  $tmp,$tmp2\n\t"
5654             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5655             "pshufd   $tmp,$tmp2,0xE\n\t"
5656             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5657             "movdq    $tmp,$src1\n\t"
5658             "vpmullq  $tmp2,$tmp2,$tmp\n\t"
5659             "movdq    $dst,$tmp2\t! mul reduction8L" %}
5660   ins_encode %{
5661     __ vextracti64x4_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5662     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $src2$$XMMRegister, 1);
5663     __ vextracti128_high($tmp$$XMMRegister, $tmp2$$XMMRegister);
5664     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5665     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5666     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5667     __ movdq($tmp$$XMMRegister, $src1$$Register);
5668     __ vpmullq($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, 0);
5669     __ movdq($dst$$Register, $tmp2$$XMMRegister);
5670   %}
5671   ins_pipe( pipe_slow );
5672 %}
5673 #endif
5674 
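// FP multiply reductions repeat the ordered scalar pattern of the FP add
// reductions above, with mulss/mulsd taking the place of addss/addsd.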
5675 instruct rsmul2F_reduction(regF dst, vecD src2, vecD tmp) %{
5676   predicate(UseSSE >= 1 && UseAVX == 0);
5677   match(Set dst (MulReductionVF dst src2));
5678   effect(TEMP dst, TEMP tmp);
5679   format %{ "mulss   $dst,$src2\n\t"
5680             "pshufd  $tmp,$src2,0x01\n\t"
5681             "mulss   $dst,$tmp\t! mul reduction2F" %}
5682   ins_encode %{
5683     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5684     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5685     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5686   %}
5687   ins_pipe( pipe_slow );
5688 %}
5689 
5690 instruct rvmul2F_reduction_reg(regF dst, vecD src2, vecD tmp) %{
5691   predicate(UseAVX > 0);
5692   match(Set dst (MulReductionVF dst src2));
5693   effect(TEMP tmp, TEMP dst);
5694   format %{ "vmulss  $dst,$dst,$src2\n\t"
5695             "pshufd  $tmp,$src2,0x01\n\t"
5696             "vmulss  $dst,$dst,$tmp\t! mul reduction2F" %}
5697   ins_encode %{
5698     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5699     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5700     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5701   %}
5702   ins_pipe( pipe_slow );
5703 %}
5704 
5705 instruct rsmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5706   predicate(UseSSE >= 1 && UseAVX == 0);
5707   match(Set dst (MulReductionVF dst src2));
5708   effect(TEMP dst, TEMP tmp);
5709   format %{ "mulss   $dst,$src2\n\t"
5710             "pshufd  $tmp,$src2,0x01\n\t"
5711             "mulss   $dst,$tmp\n\t"
5712             "pshufd  $tmp,$src2,0x02\n\t"
5713             "mulss   $dst,$tmp\n\t"
5714             "pshufd  $tmp,$src2,0x03\n\t"
5715             "mulss   $dst,$tmp\t! mul reduction4F" %}
5716   ins_encode %{
5717     __ mulss($dst$$XMMRegister, $src2$$XMMRegister);
5718     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5719     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5720     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5721     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5722     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5723     __ mulss($dst$$XMMRegister, $tmp$$XMMRegister);
5724   %}
5725   ins_pipe( pipe_slow );
5726 %}
5727 
5728 instruct rvmul4F_reduction_reg(regF dst, vecX src2, vecX tmp) %{
5729   predicate(UseAVX > 0);
5730   match(Set dst (MulReductionVF dst src2));
5731   effect(TEMP tmp, TEMP dst);
5732   format %{ "vmulss  $dst,$dst,$src2\n\t"
5733             "pshufd  $tmp,$src2,0x01\n\t"
5734             "vmulss  $dst,$dst,$tmp\n\t"
5735             "pshufd  $tmp,$src2,0x02\n\t"
5736             "vmulss  $dst,$dst,$tmp\n\t"
5737             "pshufd  $tmp,$src2,0x03\n\t"
5738             "vmulss  $dst,$dst,$tmp\t! mul reduction4F" %}
5739   ins_encode %{
5740     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5741     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5742     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5743     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5744     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5745     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5746     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5747   %}
5748   ins_pipe( pipe_slow );
5749 %}
5750 
5751 instruct rvmul8F_reduction_reg(regF dst, vecY src2, vecY tmp, vecY tmp2) %{
5752   predicate(UseAVX > 0);
5753   match(Set dst (MulReductionVF dst src2));
5754   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5755   format %{ "vmulss  $dst,$dst,$src2\n\t"
5756             "pshufd  $tmp,$src2,0x01\n\t"
5757             "vmulss  $dst,$dst,$tmp\n\t"
5758             "pshufd  $tmp,$src2,0x02\n\t"
5759             "vmulss  $dst,$dst,$tmp\n\t"
5760             "pshufd  $tmp,$src2,0x03\n\t"
5761             "vmulss  $dst,$dst,$tmp\n\t"
5762             "vextractf128_high  $tmp2,$src2\n\t"
5763             "vmulss  $dst,$dst,$tmp2\n\t"
5764             "pshufd  $tmp,$tmp2,0x01\n\t"
5765             "vmulss  $dst,$dst,$tmp\n\t"
5766             "pshufd  $tmp,$tmp2,0x02\n\t"
5767             "vmulss  $dst,$dst,$tmp\n\t"
5768             "pshufd  $tmp,$tmp2,0x03\n\t"
5769             "vmulss  $dst,$dst,$tmp\t! mul reduction8F" %}
5770   ins_encode %{
5771     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5772     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5773     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5774     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5775     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5776     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5777     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5778     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5779     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5780     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5781     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5782     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5783     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5784     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5785     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5786   %}
5787   ins_pipe( pipe_slow );
5788 %}
5789 
5790 instruct rvmul16F_reduction_reg(regF dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5791   predicate(UseAVX > 2);
5792   match(Set dst (MulReductionVF dst src2));
5793   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5794   format %{ "vmulss  $dst,$dst,$src2\n\t"
5795             "pshufd  $tmp,$src2,0x01\n\t"
5796             "vmulss  $dst,$dst,$tmp\n\t"
5797             "pshufd  $tmp,$src2,0x02\n\t"
5798             "vmulss  $dst,$dst,$tmp\n\t"
5799             "pshufd  $tmp,$src2,0x03\n\t"
5800             "vmulss  $dst,$dst,$tmp\n\t"
5801             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5802             "vmulss  $dst,$dst,$tmp2\n\t"
5803             "pshufd  $tmp,$tmp2,0x01\n\t"
5804             "vmulss  $dst,$dst,$tmp\n\t"
5805             "pshufd  $tmp,$tmp2,0x02\n\t"
5806             "vmulss  $dst,$dst,$tmp\n\t"
5807             "pshufd  $tmp,$tmp2,0x03\n\t"
5808             "vmulss  $dst,$dst,$tmp\n\t"
5809             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5810             "vmulss  $dst,$dst,$tmp2\n\t"
5811             "pshufd  $tmp,$tmp2,0x01\n\t"
5812             "vmulss  $dst,$dst,$tmp\n\t"
5813             "pshufd  $tmp,$tmp2,0x02\n\t"
5814             "vmulss  $dst,$dst,$tmp\n\t"
5815             "pshufd  $tmp,$tmp2,0x03\n\t"
5816             "vmulss  $dst,$dst,$tmp\n\t"
5817             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5818             "vmulss  $dst,$dst,$tmp2\n\t"
5819             "pshufd  $tmp,$tmp2,0x01\n\t"
5820             "vmulss  $dst,$dst,$tmp\n\t"
5821             "pshufd  $tmp,$tmp2,0x02\n\t"
5822             "vmulss  $dst,$dst,$tmp\n\t"
5823             "pshufd  $tmp,$tmp2,0x03\n\t"
5824             "vmulss  $dst,$dst,$tmp\t! mul reduction16F" %}
5825   ins_encode %{
5826     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5827     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
5828     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5829     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
5830     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5831     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
5832     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5833     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5834     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5835     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5836     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5837     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5838     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5839     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5840     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5841     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5842     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5843     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5844     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5845     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5846     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5847     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5848     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5849     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5850     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5851     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x01);
5852     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5853     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x02);
5854     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5855     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x03);
5856     __ vmulss($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5857   %}
5858   ins_pipe( pipe_slow );
5859 %}
5860 
5861 instruct rsmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5862   predicate(UseSSE >= 1 && UseAVX == 0);
5863   match(Set dst (MulReductionVD dst src2));
5864   effect(TEMP dst, TEMP tmp);
5865   format %{ "mulsd   $dst,$src2\n\t"
5866             "pshufd  $tmp,$src2,0xE\n\t"
5867             "mulsd   $dst,$tmp\t! mul reduction2D" %}
5868   ins_encode %{
5869     __ mulsd($dst$$XMMRegister, $src2$$XMMRegister);
5870     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5871     __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
5872   %}
5873   ins_pipe( pipe_slow );
5874 %}
5875 
5876 instruct rvmul2D_reduction_reg(regD dst, vecX src2, vecX tmp) %{
5877   predicate(UseAVX > 0);
5878   match(Set dst (MulReductionVD dst src2));
5879   effect(TEMP tmp, TEMP dst);
5880   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5881             "pshufd  $tmp,$src2,0xE\n\t"
5882             "vmulsd  $dst,$dst,$tmp\t! mul reduction2D" %}
5883   ins_encode %{
5884     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5885     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5886     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5887   %}
5888   ins_pipe( pipe_slow );
5889 %}
5890 
5891 instruct rvmul4D_reduction_reg(regD dst, vecY src2, vecY tmp, vecY tmp2) %{
5892   predicate(UseAVX > 0);
5893   match(Set dst (MulReductionVD dst src2));
5894   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5895   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5896             "pshufd  $tmp,$src2,0xE\n\t"
5897             "vmulsd  $dst,$dst,$tmp\n\t"
5898             "vextractf128_high  $tmp2,$src2\n\t"
5899             "vmulsd  $dst,$dst,$tmp2\n\t"
5900             "pshufd  $tmp,$tmp2,0xE\n\t"
5901             "vmulsd  $dst,$dst,$tmp\t! mul reduction4D" %}
5902   ins_encode %{
5903     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5904     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5905     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5906     __ vextractf128_high($tmp2$$XMMRegister, $src2$$XMMRegister);
5907     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5908     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5909     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5910   %}
5911   ins_pipe( pipe_slow );
5912 %}
5913 
5914 instruct rvmul8D_reduction_reg(regD dst, legVecZ src2, legVecZ tmp, legVecZ tmp2) %{
5915   predicate(UseAVX > 2);
5916   match(Set dst (MulReductionVD dst src2));
5917   effect(TEMP tmp, TEMP dst, TEMP tmp2);
5918   format %{ "vmulsd  $dst,$dst,$src2\n\t"
5919             "pshufd  $tmp,$src2,0xE\n\t"
5920             "vmulsd  $dst,$dst,$tmp\n\t"
5921             "vextractf32x4  $tmp2,$src2,0x1\n\t"
5922             "vmulsd  $dst,$dst,$tmp2\n\t"
5923             "pshufd  $tmp,$tmp2,0xE\n\t"
5924             "vmulsd  $dst,$dst,$tmp\n\t"
5925             "vextractf32x4  $tmp2,$src2,0x2\n\t"
5926             "vmulsd  $dst,$dst,$tmp2\n\t"
5927             "pshufd  $tmp,$tmp2,0xE\n\t"
5928             "vmulsd  $dst,$dst,$tmp\n\t"
5929             "vextractf32x4  $tmp2,$src2,0x3\n\t"
5930             "vmulsd  $dst,$dst,$tmp2\n\t"
5931             "pshufd  $tmp,$tmp2,0xE\n\t"
5932             "vmulsd  $dst,$dst,$tmp\t! mul reduction8D" %}
5933   ins_encode %{
5934     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $src2$$XMMRegister);
5935     __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
5936     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5937     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
5938     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5939     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5940     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5941     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x2);
5942     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5943     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5944     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5945     __ vextractf32x4($tmp2$$XMMRegister, $src2$$XMMRegister, 0x3);
5946     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister);
5947     __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0xE);
5948     __ vmulsd($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister);
5949   %}
5950   ins_pipe( pipe_slow );
5951 %}
5952 
5953 // ====================VECTOR ARITHMETIC=======================================
5954 
5955 // --------------------------------- ADD --------------------------------------
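// Each element-wise operation below comes in up to three shapes: an SSE form that
// updates dst in place, an AVX three-operand register form, and an AVX form that
// folds a memory operand into the instruction.  The vector_len argument passed to
// the assembler selects the encoded width: 0 = 128-bit (vecS/vecD/vecX operands),
// 1 = 256-bit (vecY), 2 = 512-bit (vecZ).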
5956 
5957 // Bytes vector add
5958 instruct vadd4B(vecS dst, vecS src) %{
5959   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
5960   match(Set dst (AddVB dst src));
5961   format %{ "paddb   $dst,$src\t! add packed4B" %}
5962   ins_encode %{
5963     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5964   %}
5965   ins_pipe( pipe_slow );
5966 %}
5967 
5968 instruct vadd4B_reg(vecS dst, vecS src1, vecS src2) %{
5969   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5970   match(Set dst (AddVB src1 src2));
5971   format %{ "vpaddb  $dst,$src1,$src2\t! add packed4B" %}
5972   ins_encode %{
5973     int vector_len = 0;
5974     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
5975   %}
5976   ins_pipe( pipe_slow );
5977 %}
5978 
5979 
5980 instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
5981   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
5982   match(Set dst (AddVB src (LoadVector mem)));
5983   format %{ "vpaddb  $dst,$src,$mem\t! add packed4B" %}
5984   ins_encode %{
5985     int vector_len = 0;
5986     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
5987   %}
5988   ins_pipe( pipe_slow );
5989 %}
5990 
5991 instruct vadd8B(vecD dst, vecD src) %{
5992   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
5993   match(Set dst (AddVB dst src));
5994   format %{ "paddb   $dst,$src\t! add packed8B" %}
5995   ins_encode %{
5996     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
5997   %}
5998   ins_pipe( pipe_slow );
5999 %}
6000 
6001 instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{
6002   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6003   match(Set dst (AddVB src1 src2));
6004   format %{ "vpaddb  $dst,$src1,$src2\t! add packed8B" %}
6005   ins_encode %{
6006     int vector_len = 0;
6007     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6008   %}
6009   ins_pipe( pipe_slow );
6010 %}
6011 
6012 
6013 instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
6014   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6015   match(Set dst (AddVB src (LoadVector mem)));
6016   format %{ "vpaddb  $dst,$src,$mem\t! add packed8B" %}
6017   ins_encode %{
6018     int vector_len = 0;
6019     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6020   %}
6021   ins_pipe( pipe_slow );
6022 %}
6023 
6024 instruct vadd16B(vecX dst, vecX src) %{
6025   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6026   match(Set dst (AddVB dst src));
6027   format %{ "paddb   $dst,$src\t! add packed16B" %}
6028   ins_encode %{
6029     __ paddb($dst$$XMMRegister, $src$$XMMRegister);
6030   %}
6031   ins_pipe( pipe_slow );
6032 %}
6033 
6034 instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{
6035   predicate(UseAVX > 0  && n->as_Vector()->length() == 16);
6036   match(Set dst (AddVB src1 src2));
6037   format %{ "vpaddb  $dst,$src1,$src2\t! add packed16B" %}
6038   ins_encode %{
6039     int vector_len = 0;
6040     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6041   %}
6042   ins_pipe( pipe_slow );
6043 %}
6044 
6045 instruct vadd16B_mem(vecX dst, vecX src, memory mem) %{
6046   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6047   match(Set dst (AddVB src (LoadVector mem)));
6048   format %{ "vpaddb  $dst,$src,$mem\t! add packed16B" %}
6049   ins_encode %{
6050     int vector_len = 0;
6051     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6052   %}
6053   ins_pipe( pipe_slow );
6054 %}
6055 
6056 instruct vadd32B_reg(vecY dst, vecY src1, vecY src2) %{
6057   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6058   match(Set dst (AddVB src1 src2));
6059   format %{ "vpaddb  $dst,$src1,$src2\t! add packed32B" %}
6060   ins_encode %{
6061     int vector_len = 1;
6062     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6063   %}
6064   ins_pipe( pipe_slow );
6065 %}
6066 
6067 instruct vadd32B_mem(vecY dst, vecY src, memory mem) %{
6068   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6069   match(Set dst (AddVB src (LoadVector mem)));
6070   format %{ "vpaddb  $dst,$src,$mem\t! add packed32B" %}
6071   ins_encode %{
6072     int vector_len = 1;
6073     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6074   %}
6075   ins_pipe( pipe_slow );
6076 %}
6077 
6078 instruct vadd64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6079   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6080   match(Set dst (AddVB src1 src2));
6081   format %{ "vpaddb  $dst,$src1,$src2\t! add packed64B" %}
6082   ins_encode %{
6083     int vector_len = 2;
6084     __ vpaddb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6085   %}
6086   ins_pipe( pipe_slow );
6087 %}
6088 
6089 instruct vadd64B_mem(vecZ dst, vecZ src, memory mem) %{
6090   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6091   match(Set dst (AddVB src (LoadVector mem)));
6092   format %{ "vpaddb  $dst,$src,$mem\t! add packed64B" %}
6093   ins_encode %{
6094     int vector_len = 2;
6095     __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6096   %}
6097   ins_pipe( pipe_slow );
6098 %}
6099 
6100 // Shorts/Chars vector add
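// paddw/vpaddw cover both short and char lanes: two's-complement addition is the
// same operation for signed and unsigned 16-bit elements.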
6101 instruct vadd2S(vecS dst, vecS src) %{
6102   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6103   match(Set dst (AddVS dst src));
6104   format %{ "paddw   $dst,$src\t! add packed2S" %}
6105   ins_encode %{
6106     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6107   %}
6108   ins_pipe( pipe_slow );
6109 %}
6110 
6111 instruct vadd2S_reg(vecS dst, vecS src1, vecS src2) %{
6112   predicate(UseAVX > 0  && n->as_Vector()->length() == 2);
6113   match(Set dst (AddVS src1 src2));
6114   format %{ "vpaddw  $dst,$src1,$src2\t! add packed2S" %}
6115   ins_encode %{
6116     int vector_len = 0;
6117     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6118   %}
6119   ins_pipe( pipe_slow );
6120 %}
6121 
6122 instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
6123   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6124   match(Set dst (AddVS src (LoadVector mem)));
6125   format %{ "vpaddw  $dst,$src,$mem\t! add packed2S" %}
6126   ins_encode %{
6127     int vector_len = 0;
6128     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6129   %}
6130   ins_pipe( pipe_slow );
6131 %}
6132 
6133 instruct vadd4S(vecD dst, vecD src) %{
6134   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6135   match(Set dst (AddVS dst src));
6136   format %{ "paddw   $dst,$src\t! add packed4S" %}
6137   ins_encode %{
6138     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6139   %}
6140   ins_pipe( pipe_slow );
6141 %}
6142 
6143 instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{
6144   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6145   match(Set dst (AddVS src1 src2));
6146   format %{ "vpaddw  $dst,$src1,$src2\t! add packed4S" %}
6147   ins_encode %{
6148     int vector_len = 0;
6149     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6150   %}
6151   ins_pipe( pipe_slow );
6152 %}
6153 
6154 instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
6155   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6156   match(Set dst (AddVS src (LoadVector mem)));
6157   format %{ "vpaddw  $dst,$src,$mem\t! add packed4S" %}
6158   ins_encode %{
6159     int vector_len = 0;
6160     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6161   %}
6162   ins_pipe( pipe_slow );
6163 %}
6164 
6165 instruct vadd8S(vecX dst, vecX src) %{
6166   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6167   match(Set dst (AddVS dst src));
6168   format %{ "paddw   $dst,$src\t! add packed8S" %}
6169   ins_encode %{
6170     __ paddw($dst$$XMMRegister, $src$$XMMRegister);
6171   %}
6172   ins_pipe( pipe_slow );
6173 %}
6174 
6175 instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{
6176   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6177   match(Set dst (AddVS src1 src2));
6178   format %{ "vpaddw  $dst,$src1,$src2\t! add packed8S" %}
6179   ins_encode %{
6180     int vector_len = 0;
6181     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6182   %}
6183   ins_pipe( pipe_slow );
6184 %}
6185 
6186 instruct vadd8S_mem(vecX dst, vecX src, memory mem) %{
6187   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6188   match(Set dst (AddVS src (LoadVector mem)));
6189   format %{ "vpaddw  $dst,$src,$mem\t! add packed8S" %}
6190   ins_encode %{
6191     int vector_len = 0;
6192     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6193   %}
6194   ins_pipe( pipe_slow );
6195 %}
6196 
6197 instruct vadd16S_reg(vecY dst, vecY src1, vecY src2) %{
6198   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6199   match(Set dst (AddVS src1 src2));
6200   format %{ "vpaddw  $dst,$src1,$src2\t! add packed16S" %}
6201   ins_encode %{
6202     int vector_len = 1;
6203     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6204   %}
6205   ins_pipe( pipe_slow );
6206 %}
6207 
6208 instruct vadd16S_mem(vecY dst, vecY src, memory mem) %{
6209   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6210   match(Set dst (AddVS src (LoadVector mem)));
6211   format %{ "vpaddw  $dst,$src,$mem\t! add packed16S" %}
6212   ins_encode %{
6213     int vector_len = 1;
6214     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6215   %}
6216   ins_pipe( pipe_slow );
6217 %}
6218 
6219 instruct vadd32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6220   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6221   match(Set dst (AddVS src1 src2));
6222   format %{ "vpaddw  $dst,$src1,$src2\t! add packed32S" %}
6223   ins_encode %{
6224     int vector_len = 2;
6225     __ vpaddw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6226   %}
6227   ins_pipe( pipe_slow );
6228 %}
6229 
6230 instruct vadd32S_mem(vecZ dst, vecZ src, memory mem) %{
6231   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6232   match(Set dst (AddVS src (LoadVector mem)));
6233   format %{ "vpaddw  $dst,$src,$mem\t! add packed32S" %}
6234   ins_encode %{
6235     int vector_len = 2;
6236     __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6237   %}
6238   ins_pipe( pipe_slow );
6239 %}
6240 
6241 // Integers vector add
6242 instruct vadd2I(vecD dst, vecD src) %{
6243   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6244   match(Set dst (AddVI dst src));
6245   format %{ "paddd   $dst,$src\t! add packed2I" %}
6246   ins_encode %{
6247     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6248   %}
6249   ins_pipe( pipe_slow );
6250 %}
6251 
6252 instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{
6253   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6254   match(Set dst (AddVI src1 src2));
6255   format %{ "vpaddd  $dst,$src1,$src2\t! add packed2I" %}
6256   ins_encode %{
6257     int vector_len = 0;
6258     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6259   %}
6260   ins_pipe( pipe_slow );
6261 %}
6262 
6263 instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
6264   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6265   match(Set dst (AddVI src (LoadVector mem)));
6266   format %{ "vpaddd  $dst,$src,$mem\t! add packed2I" %}
6267   ins_encode %{
6268     int vector_len = 0;
6269     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6270   %}
6271   ins_pipe( pipe_slow );
6272 %}
6273 
6274 instruct vadd4I(vecX dst, vecX src) %{
6275   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6276   match(Set dst (AddVI dst src));
6277   format %{ "paddd   $dst,$src\t! add packed4I" %}
6278   ins_encode %{
6279     __ paddd($dst$$XMMRegister, $src$$XMMRegister);
6280   %}
6281   ins_pipe( pipe_slow );
6282 %}
6283 
6284 instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{
6285   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6286   match(Set dst (AddVI src1 src2));
6287   format %{ "vpaddd  $dst,$src1,$src2\t! add packed4I" %}
6288   ins_encode %{
6289     int vector_len = 0;
6290     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6291   %}
6292   ins_pipe( pipe_slow );
6293 %}
6294 
6295 instruct vadd4I_mem(vecX dst, vecX src, memory mem) %{
6296   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6297   match(Set dst (AddVI src (LoadVector mem)));
6298   format %{ "vpaddd  $dst,$src,$mem\t! add packed4I" %}
6299   ins_encode %{
6300     int vector_len = 0;
6301     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6302   %}
6303   ins_pipe( pipe_slow );
6304 %}
6305 
6306 instruct vadd8I_reg(vecY dst, vecY src1, vecY src2) %{
6307   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6308   match(Set dst (AddVI src1 src2));
6309   format %{ "vpaddd  $dst,$src1,$src2\t! add packed8I" %}
6310   ins_encode %{
6311     int vector_len = 1;
6312     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6313   %}
6314   ins_pipe( pipe_slow );
6315 %}
6316 
6317 instruct vadd8I_mem(vecY dst, vecY src, memory mem) %{
6318   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6319   match(Set dst (AddVI src (LoadVector mem)));
6320   format %{ "vpaddd  $dst,$src,$mem\t! add packed8I" %}
6321   ins_encode %{
6322     int vector_len = 1;
6323     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6324   %}
6325   ins_pipe( pipe_slow );
6326 %}
6327 
6328 instruct vadd16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6329   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6330   match(Set dst (AddVI src1 src2));
6331   format %{ "vpaddd  $dst,$src1,$src2\t! add packed16I" %}
6332   ins_encode %{
6333     int vector_len = 2;
6334     __ vpaddd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6335   %}
6336   ins_pipe( pipe_slow );
6337 %}
6338 
6339 instruct vadd16I_mem(vecZ dst, vecZ src, memory mem) %{
6340   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6341   match(Set dst (AddVI src (LoadVector mem)));
6342   format %{ "vpaddd  $dst,$src,$mem\t! add packed16I" %}
6343   ins_encode %{
6344     int vector_len = 2;
6345     __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6346   %}
6347   ins_pipe( pipe_slow );
6348 %}
6349 
6350 // Longs vector add
6351 instruct vadd2L(vecX dst, vecX src) %{
6352   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6353   match(Set dst (AddVL dst src));
6354   format %{ "paddq   $dst,$src\t! add packed2L" %}
6355   ins_encode %{
6356     __ paddq($dst$$XMMRegister, $src$$XMMRegister);
6357   %}
6358   ins_pipe( pipe_slow );
6359 %}
6360 
6361 instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{
6362   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6363   match(Set dst (AddVL src1 src2));
6364   format %{ "vpaddq  $dst,$src1,$src2\t! add packed2L" %}
6365   ins_encode %{
6366     int vector_len = 0;
6367     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6368   %}
6369   ins_pipe( pipe_slow );
6370 %}
6371 
6372 instruct vadd2L_mem(vecX dst, vecX src, memory mem) %{
6373   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6374   match(Set dst (AddVL src (LoadVector mem)));
6375   format %{ "vpaddq  $dst,$src,$mem\t! add packed2L" %}
6376   ins_encode %{
6377     int vector_len = 0;
6378     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6379   %}
6380   ins_pipe( pipe_slow );
6381 %}
6382 
6383 instruct vadd4L_reg(vecY dst, vecY src1, vecY src2) %{
6384   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6385   match(Set dst (AddVL src1 src2));
6386   format %{ "vpaddq  $dst,$src1,$src2\t! add packed4L" %}
6387   ins_encode %{
6388     int vector_len = 1;
6389     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6390   %}
6391   ins_pipe( pipe_slow );
6392 %}
6393 
6394 instruct vadd4L_mem(vecY dst, vecY src, memory mem) %{
6395   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
6396   match(Set dst (AddVL src (LoadVector mem)));
6397   format %{ "vpaddq  $dst,$src,$mem\t! add packed4L" %}
6398   ins_encode %{
6399     int vector_len = 1;
6400     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6401   %}
6402   ins_pipe( pipe_slow );
6403 %}
6404 
6405 instruct vadd8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
6406   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6407   match(Set dst (AddVL src1 src2));
6408   format %{ "vpaddq  $dst,$src1,$src2\t! add packed8L" %}
6409   ins_encode %{
6410     int vector_len = 2;
6411     __ vpaddq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6412   %}
6413   ins_pipe( pipe_slow );
6414 %}
6415 
6416 instruct vadd8L_mem(vecZ dst, vecZ src, memory mem) %{
6417   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6418   match(Set dst (AddVL src (LoadVector mem)));
6419   format %{ "vpaddq  $dst,$src,$mem\t! add packed8L" %}
6420   ins_encode %{
6421     int vector_len = 2;
6422     __ vpaddq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6423   %}
6424   ins_pipe( pipe_slow );
6425 %}
6426 
6427 // Floats vector add
6428 instruct vadd2F(vecD dst, vecD src) %{
6429   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6430   match(Set dst (AddVF dst src));
6431   format %{ "addps   $dst,$src\t! add packed2F" %}
6432   ins_encode %{
6433     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6434   %}
6435   ins_pipe( pipe_slow );
6436 %}
6437 
6438 instruct vadd2F_reg(vecD dst, vecD src1, vecD src2) %{
6439   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6440   match(Set dst (AddVF src1 src2));
6441   format %{ "vaddps  $dst,$src1,$src2\t! add packed2F" %}
6442   ins_encode %{
6443     int vector_len = 0;
6444     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6445   %}
6446   ins_pipe( pipe_slow );
6447 %}
6448 
6449 instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
6450   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6451   match(Set dst (AddVF src (LoadVector mem)));
6452   format %{ "vaddps  $dst,$src,$mem\t! add packed2F" %}
6453   ins_encode %{
6454     int vector_len = 0;
6455     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6456   %}
6457   ins_pipe( pipe_slow );
6458 %}
6459 
6460 instruct vadd4F(vecX dst, vecX src) %{
6461   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6462   match(Set dst (AddVF dst src));
6463   format %{ "addps   $dst,$src\t! add packed4F" %}
6464   ins_encode %{
6465     __ addps($dst$$XMMRegister, $src$$XMMRegister);
6466   %}
6467   ins_pipe( pipe_slow );
6468 %}
6469 
6470 instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{
6471   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6472   match(Set dst (AddVF src1 src2));
6473   format %{ "vaddps  $dst,$src1,$src2\t! add packed4F" %}
6474   ins_encode %{
6475     int vector_len = 0;
6476     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6477   %}
6478   ins_pipe( pipe_slow );
6479 %}
6480 
6481 instruct vadd4F_mem(vecX dst, vecX src, memory mem) %{
6482   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6483   match(Set dst (AddVF src (LoadVector mem)));
6484   format %{ "vaddps  $dst,$src,$mem\t! add packed4F" %}
6485   ins_encode %{
6486     int vector_len = 0;
6487     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6488   %}
6489   ins_pipe( pipe_slow );
6490 %}
6491 
6492 instruct vadd8F_reg(vecY dst, vecY src1, vecY src2) %{
6493   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6494   match(Set dst (AddVF src1 src2));
6495   format %{ "vaddps  $dst,$src1,$src2\t! add packed8F" %}
6496   ins_encode %{
6497     int vector_len = 1;
6498     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6499   %}
6500   ins_pipe( pipe_slow );
6501 %}
6502 
6503 instruct vadd8F_mem(vecY dst, vecY src, memory mem) %{
6504   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6505   match(Set dst (AddVF src (LoadVector mem)));
6506   format %{ "vaddps  $dst,$src,$mem\t! add packed8F" %}
6507   ins_encode %{
6508     int vector_len = 1;
6509     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6510   %}
6511   ins_pipe( pipe_slow );
6512 %}
6513 
6514 instruct vadd16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
6515   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6516   match(Set dst (AddVF src1 src2));
6517   format %{ "vaddps  $dst,$src1,$src2\t! add packed16F" %}
6518   ins_encode %{
6519     int vector_len = 2;
6520     __ vaddps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6521   %}
6522   ins_pipe( pipe_slow );
6523 %}
6524 
6525 instruct vadd16F_mem(vecZ dst, vecZ src, memory mem) %{
6526   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6527   match(Set dst (AddVF src (LoadVector mem)));
6528   format %{ "vaddps  $dst,$src,$mem\t! add packed16F" %}
6529   ins_encode %{
6530     int vector_len = 2;
6531     __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6532   %}
6533   ins_pipe( pipe_slow );
6534 %}
6535 
6536 // Doubles vector add
6537 instruct vadd2D(vecX dst, vecX src) %{
6538   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6539   match(Set dst (AddVD dst src));
6540   format %{ "addpd   $dst,$src\t! add packed2D" %}
6541   ins_encode %{
6542     __ addpd($dst$$XMMRegister, $src$$XMMRegister);
6543   %}
6544   ins_pipe( pipe_slow );
6545 %}
6546 
6547 instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{
6548   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6549   match(Set dst (AddVD src1 src2));
6550   format %{ "vaddpd  $dst,$src1,$src2\t! add packed2D" %}
6551   ins_encode %{
6552     int vector_len = 0;
6553     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6554   %}
6555   ins_pipe( pipe_slow );
6556 %}
6557 
6558 instruct vadd2D_mem(vecX dst, vecX src, memory mem) %{
6559   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6560   match(Set dst (AddVD src (LoadVector mem)));
6561   format %{ "vaddpd  $dst,$src,$mem\t! add packed2D" %}
6562   ins_encode %{
6563     int vector_len = 0;
6564     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6565   %}
6566   ins_pipe( pipe_slow );
6567 %}
6568 
6569 instruct vadd4D_reg(vecY dst, vecY src1, vecY src2) %{
6570   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6571   match(Set dst (AddVD src1 src2));
6572   format %{ "vaddpd  $dst,$src1,$src2\t! add packed4D" %}
6573   ins_encode %{
6574     int vector_len = 1;
6575     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6576   %}
6577   ins_pipe( pipe_slow );
6578 %}
6579 
6580 instruct vadd4D_mem(vecY dst, vecY src, memory mem) %{
6581   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6582   match(Set dst (AddVD src (LoadVector mem)));
6583   format %{ "vaddpd  $dst,$src,$mem\t! add packed4D" %}
6584   ins_encode %{
6585     int vector_len = 1;
6586     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6587   %}
6588   ins_pipe( pipe_slow );
6589 %}
6590 
6591 instruct vadd8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
6592   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6593   match(Set dst (AddVD src1 src2));
6594   format %{ "vaddpd  $dst,$src1,$src2\t! add packed8D" %}
6595   ins_encode %{
6596     int vector_len = 2;
6597     __ vaddpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6598   %}
6599   ins_pipe( pipe_slow );
6600 %}
6601 
6602 instruct vadd8D_mem(vecZ dst, vecZ src, memory mem) %{
6603   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
6604   match(Set dst (AddVD src (LoadVector mem)));
6605   format %{ "vaddpd  $dst,$src,$mem\t! add packed8D" %}
6606   ins_encode %{
6607     int vector_len = 2;
6608     __ vaddpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6609   %}
6610   ins_pipe( pipe_slow );
6611 %}
6612 
6613 // --------------------------------- SUB --------------------------------------
6614 
6615 // Bytes vector sub
6616 instruct vsub4B(vecS dst, vecS src) %{
6617   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6618   match(Set dst (SubVB dst src));
6619   format %{ "psubb   $dst,$src\t! sub packed4B" %}
6620   ins_encode %{
6621     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6622   %}
6623   ins_pipe( pipe_slow );
6624 %}
6625 
6626 instruct vsub4B_reg(vecS dst, vecS src1, vecS src2) %{
6627   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6628   match(Set dst (SubVB src1 src2));
6629   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed4B" %}
6630   ins_encode %{
6631     int vector_len = 0;
6632     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6633   %}
6634   ins_pipe( pipe_slow );
6635 %}
6636 
6637 instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
6638   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6639   match(Set dst (SubVB src (LoadVector mem)));
6640   format %{ "vpsubb  $dst,$src,$mem\t! sub packed4B" %}
6641   ins_encode %{
6642     int vector_len = 0;
6643     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6644   %}
6645   ins_pipe( pipe_slow );
6646 %}
6647 
6648 instruct vsub8B(vecD dst, vecD src) %{
6649   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6650   match(Set dst (SubVB dst src));
6651   format %{ "psubb   $dst,$src\t! sub packed8B" %}
6652   ins_encode %{
6653     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6654   %}
6655   ins_pipe( pipe_slow );
6656 %}
6657 
6658 instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{
6659   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6660   match(Set dst (SubVB src1 src2));
6661   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed8B" %}
6662   ins_encode %{
6663     int vector_len = 0;
6664     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6665   %}
6666   ins_pipe( pipe_slow );
6667 %}
6668 
6669 instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
6670   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6671   match(Set dst (SubVB src (LoadVector mem)));
6672   format %{ "vpsubb  $dst,$src,$mem\t! sub packed8B" %}
6673   ins_encode %{
6674     int vector_len = 0;
6675     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6676   %}
6677   ins_pipe( pipe_slow );
6678 %}
6679 
6680 instruct vsub16B(vecX dst, vecX src) %{
6681   predicate(UseAVX == 0 && n->as_Vector()->length() == 16);
6682   match(Set dst (SubVB dst src));
6683   format %{ "psubb   $dst,$src\t! sub packed16B" %}
6684   ins_encode %{
6685     __ psubb($dst$$XMMRegister, $src$$XMMRegister);
6686   %}
6687   ins_pipe( pipe_slow );
6688 %}
6689 
6690 instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{
6691   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6692   match(Set dst (SubVB src1 src2));
6693   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed16B" %}
6694   ins_encode %{
6695     int vector_len = 0;
6696     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6697   %}
6698   ins_pipe( pipe_slow );
6699 %}
6700 
6701 instruct vsub16B_mem(vecX dst, vecX src, memory mem) %{
6702   predicate(UseAVX > 0 && n->as_Vector()->length() == 16);
6703   match(Set dst (SubVB src (LoadVector mem)));
6704   format %{ "vpsubb  $dst,$src,$mem\t! sub packed16B" %}
6705   ins_encode %{
6706     int vector_len = 0;
6707     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6708   %}
6709   ins_pipe( pipe_slow );
6710 %}
6711 
6712 instruct vsub32B_reg(vecY dst, vecY src1, vecY src2) %{
6713   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6714   match(Set dst (SubVB src1 src2));
6715   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed32B" %}
6716   ins_encode %{
6717     int vector_len = 1;
6718     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6719   %}
6720   ins_pipe( pipe_slow );
6721 %}
6722 
6723 instruct vsub32B_mem(vecY dst, vecY src, memory mem) %{
6724   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
6725   match(Set dst (SubVB src (LoadVector mem)));
6726   format %{ "vpsubb  $dst,$src,$mem\t! sub packed32B" %}
6727   ins_encode %{
6728     int vector_len = 1;
6729     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6730   %}
6731   ins_pipe( pipe_slow );
6732 %}
6733 
6734 instruct vsub64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
6735   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6736   match(Set dst (SubVB src1 src2));
6737   format %{ "vpsubb  $dst,$src1,$src2\t! sub packed64B" %}
6738   ins_encode %{
6739     int vector_len = 2;
6740     __ vpsubb($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6741   %}
6742   ins_pipe( pipe_slow );
6743 %}
6744 
6745 instruct vsub64B_mem(vecZ dst, vecZ src, memory mem) %{
6746   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 64);
6747   match(Set dst (SubVB src (LoadVector mem)));
6748   format %{ "vpsubb  $dst,$src,$mem\t! sub packed64B" %}
6749   ins_encode %{
6750     int vector_len = 2;
6751     __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6752   %}
6753   ins_pipe( pipe_slow );
6754 %}
6755 
6756 // Shorts/Chars vector sub
6757 instruct vsub2S(vecS dst, vecS src) %{
6758   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6759   match(Set dst (SubVS dst src));
6760   format %{ "psubw   $dst,$src\t! sub packed2S" %}
6761   ins_encode %{
6762     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6763   %}
6764   ins_pipe( pipe_slow );
6765 %}
6766 
6767 instruct vsub2S_reg(vecS dst, vecS src1, vecS src2) %{
6768   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6769   match(Set dst (SubVS src1 src2));
6770   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed2S" %}
6771   ins_encode %{
6772     int vector_len = 0;
6773     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6774   %}
6775   ins_pipe( pipe_slow );
6776 %}
6777 
6778 instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
6779   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6780   match(Set dst (SubVS src (LoadVector mem)));
6781   format %{ "vpsubw  $dst,$src,$mem\t! sub packed2S" %}
6782   ins_encode %{
6783     int vector_len = 0;
6784     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6785   %}
6786   ins_pipe( pipe_slow );
6787 %}
6788 
6789 instruct vsub4S(vecD dst, vecD src) %{
6790   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6791   match(Set dst (SubVS dst src));
6792   format %{ "psubw   $dst,$src\t! sub packed4S" %}
6793   ins_encode %{
6794     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6795   %}
6796   ins_pipe( pipe_slow );
6797 %}
6798 
6799 instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{
6800   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6801   match(Set dst (SubVS src1 src2));
6802   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed4S" %}
6803   ins_encode %{
6804     int vector_len = 0;
6805     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6806   %}
6807   ins_pipe( pipe_slow );
6808 %}
6809 
6810 instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
6811   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6812   match(Set dst (SubVS src (LoadVector mem)));
6813   format %{ "vpsubw  $dst,$src,$mem\t! sub packed4S" %}
6814   ins_encode %{
6815     int vector_len = 0;
6816     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6817   %}
6818   ins_pipe( pipe_slow );
6819 %}
6820 
6821 instruct vsub8S(vecX dst, vecX src) %{
6822   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
6823   match(Set dst (SubVS dst src));
6824   format %{ "psubw   $dst,$src\t! sub packed8S" %}
6825   ins_encode %{
6826     __ psubw($dst$$XMMRegister, $src$$XMMRegister);
6827   %}
6828   ins_pipe( pipe_slow );
6829 %}
6830 
6831 instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{
6832   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6833   match(Set dst (SubVS src1 src2));
6834   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed8S" %}
6835   ins_encode %{
6836     int vector_len = 0;
6837     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6838   %}
6839   ins_pipe( pipe_slow );
6840 %}
6841 
6842 instruct vsub8S_mem(vecX dst, vecX src, memory mem) %{
6843   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
6844   match(Set dst (SubVS src (LoadVector mem)));
6845   format %{ "vpsubw  $dst,$src,$mem\t! sub packed8S" %}
6846   ins_encode %{
6847     int vector_len = 0;
6848     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6849   %}
6850   ins_pipe( pipe_slow );
6851 %}
6852 
6853 instruct vsub16S_reg(vecY dst, vecY src1, vecY src2) %{
6854   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6855   match(Set dst (SubVS src1 src2));
6856   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed16S" %}
6857   ins_encode %{
6858     int vector_len = 1;
6859     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6860   %}
6861   ins_pipe( pipe_slow );
6862 %}
6863 
6864 instruct vsub16S_mem(vecY dst, vecY src, memory mem) %{
6865   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
6866   match(Set dst (SubVS src (LoadVector mem)));
6867   format %{ "vpsubw  $dst,$src,$mem\t! sub packed16S" %}
6868   ins_encode %{
6869     int vector_len = 1;
6870     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6871   %}
6872   ins_pipe( pipe_slow );
6873 %}
6874 
6875 instruct vsub32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
6876   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6877   match(Set dst (SubVS src1 src2));
6878   format %{ "vpsubw  $dst,$src1,$src2\t! sub packed32S" %}
6879   ins_encode %{
6880     int vector_len = 2;
6881     __ vpsubw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6882   %}
6883   ins_pipe( pipe_slow );
6884 %}
6885 
6886 instruct vsub32S_mem(vecZ dst, vecZ src, memory mem) %{
6887   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
6888   match(Set dst (SubVS src (LoadVector mem)));
6889   format %{ "vpsubw  $dst,$src,$mem\t! sub packed32S" %}
6890   ins_encode %{
6891     int vector_len = 2;
6892     __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6893   %}
6894   ins_pipe( pipe_slow );
6895 %}
6896 
6897 // Integers vector sub
6898 instruct vsub2I(vecD dst, vecD src) %{
6899   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
6900   match(Set dst (SubVI dst src));
6901   format %{ "psubd   $dst,$src\t! sub packed2I" %}
6902   ins_encode %{
6903     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6904   %}
6905   ins_pipe( pipe_slow );
6906 %}
6907 
6908 instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{
6909   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6910   match(Set dst (SubVI src1 src2));
6911   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed2I" %}
6912   ins_encode %{
6913     int vector_len = 0;
6914     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6915   %}
6916   ins_pipe( pipe_slow );
6917 %}
6918 
6919 instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
6920   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
6921   match(Set dst (SubVI src (LoadVector mem)));
6922   format %{ "vpsubd  $dst,$src,$mem\t! sub packed2I" %}
6923   ins_encode %{
6924     int vector_len = 0;
6925     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6926   %}
6927   ins_pipe( pipe_slow );
6928 %}
6929 
6930 instruct vsub4I(vecX dst, vecX src) %{
6931   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
6932   match(Set dst (SubVI dst src));
6933   format %{ "psubd   $dst,$src\t! sub packed4I" %}
6934   ins_encode %{
6935     __ psubd($dst$$XMMRegister, $src$$XMMRegister);
6936   %}
6937   ins_pipe( pipe_slow );
6938 %}
6939 
6940 instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{
6941   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6942   match(Set dst (SubVI src1 src2));
6943   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed4I" %}
6944   ins_encode %{
6945     int vector_len = 0;
6946     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6947   %}
6948   ins_pipe( pipe_slow );
6949 %}
6950 
6951 instruct vsub4I_mem(vecX dst, vecX src, memory mem) %{
6952   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
6953   match(Set dst (SubVI src (LoadVector mem)));
6954   format %{ "vpsubd  $dst,$src,$mem\t! sub packed4I" %}
6955   ins_encode %{
6956     int vector_len = 0;
6957     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6958   %}
6959   ins_pipe( pipe_slow );
6960 %}
6961 
6962 instruct vsub8I_reg(vecY dst, vecY src1, vecY src2) %{
6963   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6964   match(Set dst (SubVI src1 src2));
6965   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed8I" %}
6966   ins_encode %{
6967     int vector_len = 1;
6968     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6969   %}
6970   ins_pipe( pipe_slow );
6971 %}
6972 
6973 instruct vsub8I_mem(vecY dst, vecY src, memory mem) %{
6974   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
6975   match(Set dst (SubVI src (LoadVector mem)));
6976   format %{ "vpsubd  $dst,$src,$mem\t! sub packed8I" %}
6977   ins_encode %{
6978     int vector_len = 1;
6979     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
6980   %}
6981   ins_pipe( pipe_slow );
6982 %}
6983 
6984 instruct vsub16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
6985   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6986   match(Set dst (SubVI src1 src2));
6987   format %{ "vpsubd  $dst,$src1,$src2\t! sub packed16I" %}
6988   ins_encode %{
6989     int vector_len = 2;
6990     __ vpsubd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
6991   %}
6992   ins_pipe( pipe_slow );
6993 %}
6994 
6995 instruct vsub16I_mem(vecZ dst, vecZ src, memory mem) %{
6996   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
6997   match(Set dst (SubVI src (LoadVector mem)));
6998   format %{ "vpsubd  $dst,$src,$mem\t! sub packed16I" %}
6999   ins_encode %{
7000     int vector_len = 2;
7001     __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7002   %}
7003   ins_pipe( pipe_slow );
7004 %}
7005 
7006 // Longs vector sub
7007 instruct vsub2L(vecX dst, vecX src) %{
7008   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7009   match(Set dst (SubVL dst src));
7010   format %{ "psubq   $dst,$src\t! sub packed2L" %}
7011   ins_encode %{
7012     __ psubq($dst$$XMMRegister, $src$$XMMRegister);
7013   %}
7014   ins_pipe( pipe_slow );
7015 %}
7016 
7017 instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{
7018   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7019   match(Set dst (SubVL src1 src2));
7020   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed2L" %}
7021   ins_encode %{
7022     int vector_len = 0;
7023     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7024   %}
7025   ins_pipe( pipe_slow );
7026 %}
7027 
7028 instruct vsub2L_mem(vecX dst, vecX src, memory mem) %{
7029   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7030   match(Set dst (SubVL src (LoadVector mem)));
7031   format %{ "vpsubq  $dst,$src,$mem\t! sub packed2L" %}
7032   ins_encode %{
7033     int vector_len = 0;
7034     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7035   %}
7036   ins_pipe( pipe_slow );
7037 %}
7038 
7039 instruct vsub4L_reg(vecY dst, vecY src1, vecY src2) %{
7040   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7041   match(Set dst (SubVL src1 src2));
7042   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed4L" %}
7043   ins_encode %{
7044     int vector_len = 1;
7045     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7046   %}
7047   ins_pipe( pipe_slow );
7048 %}
7049 
7050 instruct vsub4L_mem(vecY dst, vecY src, memory mem) %{
7051   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
7052   match(Set dst (SubVL src (LoadVector mem)));
7053   format %{ "vpsubq  $dst,$src,$mem\t! sub packed4L" %}
7054   ins_encode %{
7055     int vector_len = 1;
7056     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7057   %}
7058   ins_pipe( pipe_slow );
7059 %}
7060 
7061 instruct vsub8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7062   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7063   match(Set dst (SubVL src1 src2));
7064   format %{ "vpsubq  $dst,$src1,$src2\t! sub packed8L" %}
7065   ins_encode %{
7066     int vector_len = 2;
7067     __ vpsubq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7068   %}
7069   ins_pipe( pipe_slow );
7070 %}
7071 
7072 instruct vsub8L_mem(vecZ dst, vecZ src, memory mem) %{
7073   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7074   match(Set dst (SubVL src (LoadVector mem)));
7075   format %{ "vpsubq  $dst,$src,$mem\t! sub packed8L" %}
7076   ins_encode %{
7077     int vector_len = 2;
7078     __ vpsubq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7079   %}
7080   ins_pipe( pipe_slow );
7081 %}
7082 
7083 // Floats vector sub
7084 instruct vsub2F(vecD dst, vecD src) %{
7085   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7086   match(Set dst (SubVF dst src));
7087   format %{ "subps   $dst,$src\t! sub packed2F" %}
7088   ins_encode %{
7089     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7090   %}
7091   ins_pipe( pipe_slow );
7092 %}
7093 
7094 instruct vsub2F_reg(vecD dst, vecD src1, vecD src2) %{
7095   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7096   match(Set dst (SubVF src1 src2));
7097   format %{ "vsubps  $dst,$src1,$src2\t! sub packed2F" %}
7098   ins_encode %{
7099     int vector_len = 0;
7100     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7101   %}
7102   ins_pipe( pipe_slow );
7103 %}
7104 
7105 instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
7106   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7107   match(Set dst (SubVF src (LoadVector mem)));
7108   format %{ "vsubps  $dst,$src,$mem\t! sub packed2F" %}
7109   ins_encode %{
7110     int vector_len = 0;
7111     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7112   %}
7113   ins_pipe( pipe_slow );
7114 %}
7115 
7116 instruct vsub4F(vecX dst, vecX src) %{
7117   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7118   match(Set dst (SubVF dst src));
7119   format %{ "subps   $dst,$src\t! sub packed4F" %}
7120   ins_encode %{
7121     __ subps($dst$$XMMRegister, $src$$XMMRegister);
7122   %}
7123   ins_pipe( pipe_slow );
7124 %}
7125 
7126 instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{
7127   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7128   match(Set dst (SubVF src1 src2));
7129   format %{ "vsubps  $dst,$src1,$src2\t! sub packed4F" %}
7130   ins_encode %{
7131     int vector_len = 0;
7132     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7133   %}
7134   ins_pipe( pipe_slow );
7135 %}
7136 
7137 instruct vsub4F_mem(vecX dst, vecX src, memory mem) %{
7138   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7139   match(Set dst (SubVF src (LoadVector mem)));
7140   format %{ "vsubps  $dst,$src,$mem\t! sub packed4F" %}
7141   ins_encode %{
7142     int vector_len = 0;
7143     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7144   %}
7145   ins_pipe( pipe_slow );
7146 %}
7147 
7148 instruct vsub8F_reg(vecY dst, vecY src1, vecY src2) %{
7149   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7150   match(Set dst (SubVF src1 src2));
7151   format %{ "vsubps  $dst,$src1,$src2\t! sub packed8F" %}
7152   ins_encode %{
7153     int vector_len = 1;
7154     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7155   %}
7156   ins_pipe( pipe_slow );
7157 %}
7158 
7159 instruct vsub8F_mem(vecY dst, vecY src, memory mem) %{
7160   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7161   match(Set dst (SubVF src (LoadVector mem)));
7162   format %{ "vsubps  $dst,$src,$mem\t! sub packed8F" %}
7163   ins_encode %{
7164     int vector_len = 1;
7165     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7166   %}
7167   ins_pipe( pipe_slow );
7168 %}
7169 
7170 instruct vsub16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7171   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7172   match(Set dst (SubVF src1 src2));
7173   format %{ "vsubps  $dst,$src1,$src2\t! sub packed16F" %}
7174   ins_encode %{
7175     int vector_len = 2;
7176     __ vsubps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7177   %}
7178   ins_pipe( pipe_slow );
7179 %}
7180 
7181 instruct vsub16F_mem(vecZ dst, vecZ src, memory mem) %{
7182   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7183   match(Set dst (SubVF src (LoadVector mem)));
7184   format %{ "vsubps  $dst,$src,$mem\t! sub packed16F" %}
7185   ins_encode %{
7186     int vector_len = 2;
7187     __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7188   %}
7189   ins_pipe( pipe_slow );
7190 %}
7191 
7192 // Doubles vector sub
7193 instruct vsub2D(vecX dst, vecX src) %{
7194   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7195   match(Set dst (SubVD dst src));
7196   format %{ "subpd   $dst,$src\t! sub packed2D" %}
7197   ins_encode %{
7198     __ subpd($dst$$XMMRegister, $src$$XMMRegister);
7199   %}
7200   ins_pipe( pipe_slow );
7201 %}
7202 
7203 instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{
7204   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7205   match(Set dst (SubVD src1 src2));
7206   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed2D" %}
7207   ins_encode %{
7208     int vector_len = 0;
7209     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7210   %}
7211   ins_pipe( pipe_slow );
7212 %}
7213 
7214 instruct vsub2D_mem(vecX dst, vecX src, memory mem) %{
7215   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7216   match(Set dst (SubVD src (LoadVector mem)));
7217   format %{ "vsubpd  $dst,$src,$mem\t! sub packed2D" %}
7218   ins_encode %{
7219     int vector_len = 0;
7220     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7221   %}
7222   ins_pipe( pipe_slow );
7223 %}
7224 
7225 instruct vsub4D_reg(vecY dst, vecY src1, vecY src2) %{
7226   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7227   match(Set dst (SubVD src1 src2));
7228   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed4D" %}
7229   ins_encode %{
7230     int vector_len = 1;
7231     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7232   %}
7233   ins_pipe( pipe_slow );
7234 %}
7235 
7236 instruct vsub4D_mem(vecY dst, vecY src, memory mem) %{
7237   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7238   match(Set dst (SubVD src (LoadVector mem)));
7239   format %{ "vsubpd  $dst,$src,$mem\t! sub packed4D" %}
7240   ins_encode %{
7241     int vector_len = 1;
7242     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7243   %}
7244   ins_pipe( pipe_slow );
7245 %}
7246 
7247 instruct vsub8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7248   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7249   match(Set dst (SubVD src1 src2));
7250   format %{ "vsubpd  $dst,$src1,$src2\t! sub packed8D" %}
7251   ins_encode %{
7252     int vector_len = 2;
7253     __ vsubpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7254   %}
7255   ins_pipe( pipe_slow );
7256 %}
7257 
7258 instruct vsub8D_mem(vecZ dst, vecZ src, memory mem) %{
7259   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7260   match(Set dst (SubVD src (LoadVector mem)));
7261   format %{ "vsubpd  $dst,$src,$mem\t! sub packed8D" %}
7262   ins_encode %{
7263     int vector_len = 2;
7264     __ vsubpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7265   %}
7266   ins_pipe( pipe_slow );
7267 %}
7268 
7269 // --------------------------------- MUL --------------------------------------
7270 
7271 // Byte vector mul
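// There is no SSE/AVX instruction that multiplies packed bytes directly.  The
// rules below sign-extend the byte elements to 16-bit lanes (pmovsxbw /
// vpmovsxbw), multiply with pmullw, mask off the high byte of each 16-bit
// product using the vector_short_to_byte_mask constant, and pack the low bytes
// back together with packuswb.  The 256-bit and 512-bit forms also have to
// repair the element order afterwards, because the pack instructions operate
// independently within each 128-bit lane.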
7272 instruct mul4B_reg(vecS dst, vecS src1, vecS src2, vecS tmp, rRegI scratch) %{
7273   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7274   match(Set dst (MulVB src1 src2));
7275   effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{"pmovsxbw  $tmp,$src1\n\t"
           "pmovsxbw  $dst,$src2\n\t"
           "pmullw    $tmp,$dst\n\t"
           "movdqu    $dst,[0x00ff00ff00ff00ff]\n\t"
           "pand      $dst,$tmp\n\t"
           "packuswb  $dst,$dst\t! mul packed4B" %}
7282   ins_encode %{
7283     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
7284     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
7285     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
7286     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7287     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
7288     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
7289   %}
7290   ins_pipe( pipe_slow );
7291 %}
7292 
7293 instruct mul8B_reg(vecD dst, vecD src1, vecD src2, vecD tmp, rRegI scratch) %{
7294   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
7295   match(Set dst (MulVB src1 src2));
7296   effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{"pmovsxbw  $tmp,$src1\n\t"
           "pmovsxbw  $dst,$src2\n\t"
           "pmullw    $tmp,$dst\n\t"
           "movdqu    $dst,[0x00ff00ff00ff00ff]\n\t"
           "pand      $dst,$tmp\n\t"
           "packuswb  $dst,$dst\t! mul packed8B" %}
7303   ins_encode %{
7304     __ pmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister);
7305     __ pmovsxbw($dst$$XMMRegister, $src2$$XMMRegister);
7306     __ pmullw($tmp$$XMMRegister, $dst$$XMMRegister);
7307     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7308     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
7309     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
7310   %}
7311   ins_pipe( pipe_slow );
7312 %}
7313 
7314 instruct mul16B_reg(vecX dst, vecX src1, vecX src2, vecX tmp1, vecX tmp2, rRegI scratch) %{
7315   predicate(UseSSE > 3 && n->as_Vector()->length() == 16);
7316   match(Set dst (MulVB src1 src2));
7317   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
  format %{"pmovsxbw  $tmp1,$src1\n\t"
           "pmovsxbw  $tmp2,$src2\n\t"
           "pmullw    $tmp1,$tmp2\n\t"
           "pshufd    $tmp2,$src1,0xEE\n\t"
           "pshufd    $dst,$src2,0xEE\n\t"
           "pmovsxbw  $tmp2,$tmp2\n\t"
           "pmovsxbw  $dst,$dst\n\t"
           "pmullw    $tmp2,$dst\n\t"
           "movdqu    $dst,[0x00ff00ff00ff00ff]\n\t"
           "pand      $tmp2,$dst\n\t"
           "pand      $dst,$tmp1\n\t"
           "packuswb  $dst,$tmp2\t! mul packed16B" %}
7330   ins_encode %{
7331     __ pmovsxbw($tmp1$$XMMRegister, $src1$$XMMRegister);
7332     __ pmovsxbw($tmp2$$XMMRegister, $src2$$XMMRegister);
7333     __ pmullw($tmp1$$XMMRegister, $tmp2$$XMMRegister);
7334     __ pshufd($tmp2$$XMMRegister, $src1$$XMMRegister, 0xEE);
7335     __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xEE);
7336     __ pmovsxbw($tmp2$$XMMRegister, $tmp2$$XMMRegister);
7337     __ pmovsxbw($dst$$XMMRegister, $dst$$XMMRegister);
7338     __ pmullw($tmp2$$XMMRegister, $dst$$XMMRegister);
7339     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7340     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
7341     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
7342     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
7343   %}
7344   ins_pipe( pipe_slow );
7345 %}
7346 
7347 instruct vmul16B_reg_avx(vecX dst, vecX src1, vecX src2, vecX tmp, rRegI scratch) %{
7348   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7349   match(Set dst (MulVB src1 src2));
7350   effect(TEMP dst, TEMP tmp, TEMP scratch);
  format %{"vpmovsxbw  $tmp,$src1\n\t"
           "vpmovsxbw  $dst,$src2\n\t"
           "vpmullw    $tmp,$tmp,$dst\n\t"
           "vmovdqu    $dst,[0x00ff00ff00ff00ff]\n\t"
           "vpand      $dst,$dst,$tmp\n\t"
           "vextracti128_high  $tmp,$dst\n\t"
           "vpackuswb  $dst,$dst,$tmp\t! mul packed16B" %}
7358   ins_encode %{
    int vector_len = 1;
7360     __ vpmovsxbw($tmp$$XMMRegister, $src1$$XMMRegister, vector_len);
7361     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
7362     __ vpmullw($tmp$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, vector_len);
7363     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7364     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
7365     __ vextracti128_high($tmp$$XMMRegister, $dst$$XMMRegister);
7366     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, 0);
7367   %}
7368   ins_pipe( pipe_slow );
7369 %}
7370 
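// For 32 bytes, vpackuswb packs within each 128-bit lane, leaving the low-half
// and high-half products interleaved at quadword granularity; the trailing
// vpermq with shuffle immediate 0xD8 restores the original element order.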
7371 instruct vmul32B_reg_avx(vecY dst, vecY src1, vecY src2, vecY tmp1, vecY tmp2, rRegI scratch) %{
7372   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
7373   match(Set dst (MulVB src1 src2));
7374   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
  format %{"vextracti128_high  $tmp1,$src1\n\t"
           "vextracti128_high  $dst,$src2\n\t"
           "vpmovsxbw $tmp1,$tmp1\n\t"
           "vpmovsxbw $dst,$dst\n\t"
           "vpmullw $tmp1,$tmp1,$dst\n\t"
           "vpmovsxbw $tmp2,$src1\n\t"
           "vpmovsxbw $dst,$src2\n\t"
           "vpmullw $tmp2,$tmp2,$dst\n\t"
           "vmovdqu $dst, [0x00ff00ff00ff00ff]\n\t"
           "vpbroadcastd $dst, $dst\n\t"
           "vpand $tmp1,$tmp1,$dst\n\t"
           "vpand $dst,$dst,$tmp2\n\t"
           "vpackuswb $dst,$dst,$tmp1\n\t"
           "vpermq $dst, $dst, 0xD8\t! mul packed32B" %}
7389   ins_encode %{
7390     int vector_len = 1;
7391     __ vextracti128_high($tmp1$$XMMRegister, $src1$$XMMRegister);
7392     __ vextracti128_high($dst$$XMMRegister, $src2$$XMMRegister);
7393     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7394     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7395     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7396     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
7397     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
7398     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7399     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7400     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7401     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7402     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7403     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7404     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
7405   %}
7406   ins_pipe( pipe_slow );
7407 %}
7408 
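// For 64 bytes, vpackuswb again packs within each 128-bit lane, so the
// low-half and high-half products end up interleaved across the eight
// quadwords; the variable vpermq with the vector_byte_perm_mask permutation
// constant gathers the quadwords back into element order.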
7409 instruct vmul64B_reg_avx(vecZ dst, vecZ src1, vecZ src2, vecZ tmp1, vecZ tmp2, rRegI scratch) %{
7410   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
7411   match(Set dst (MulVB src1 src2));
7412   effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
  format %{"vextracti64x4_high  $tmp1,$src1\n\t"
           "vextracti64x4_high  $dst,$src2\n\t"
           "vpmovsxbw $tmp1,$tmp1\n\t"
           "vpmovsxbw $dst,$dst\n\t"
           "vpmullw $tmp1,$tmp1,$dst\n\t"
           "vpmovsxbw $tmp2,$src1\n\t"
           "vpmovsxbw $dst,$src2\n\t"
           "vpmullw $tmp2,$tmp2,$dst\n\t"
           "vmovdqu $dst, [0x00ff00ff00ff00ff]\n\t"
           "vpbroadcastd $dst, $dst\n\t"
           "vpand $tmp1,$tmp1,$dst\n\t"
           "vpand $tmp2,$tmp2,$dst\n\t"
           "vpackuswb $dst,$tmp1,$tmp2\n\t"
           "evmovdquq  $tmp2,[0x0604020007050301]\n\t"
           "vpermq $dst,$tmp2,$dst\t! mul packed64B" %}
7429   ins_encode %{
7430     int vector_len = 2;
7431     __ vextracti64x4_high($tmp1$$XMMRegister, $src1$$XMMRegister);
7432     __ vextracti64x4_high($dst$$XMMRegister, $src2$$XMMRegister);
7433     __ vpmovsxbw($tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
7434     __ vpmovsxbw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7435     __ vpmullw($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7436     __ vpmovsxbw($tmp2$$XMMRegister, $src1$$XMMRegister, vector_len);
7437     __ vpmovsxbw($dst$$XMMRegister, $src2$$XMMRegister, vector_len);
7438     __ vpmullw($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7439     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
7440     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
7441     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
7442     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7443     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len);
7444     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
7445     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
7447   %}
7448   ins_pipe( pipe_slow );
7449 %}
7450 
7451 // Shorts/Chars vector mul
7452 instruct vmul2S(vecS dst, vecS src) %{
7453   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7454   match(Set dst (MulVS dst src));
7455   format %{ "pmullw $dst,$src\t! mul packed2S" %}
7456   ins_encode %{
7457     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7458   %}
7459   ins_pipe( pipe_slow );
7460 %}
7461 
7462 instruct vmul2S_reg(vecS dst, vecS src1, vecS src2) %{
7463   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7464   match(Set dst (MulVS src1 src2));
7465   format %{ "vpmullw $dst,$src1,$src2\t! mul packed2S" %}
7466   ins_encode %{
7467     int vector_len = 0;
7468     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7469   %}
7470   ins_pipe( pipe_slow );
7471 %}
7472 
7473 instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
7474   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7475   match(Set dst (MulVS src (LoadVector mem)));
7476   format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
7477   ins_encode %{
7478     int vector_len = 0;
7479     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7480   %}
7481   ins_pipe( pipe_slow );
7482 %}
7483 
7484 instruct vmul4S(vecD dst, vecD src) %{
7485   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7486   match(Set dst (MulVS dst src));
7487   format %{ "pmullw  $dst,$src\t! mul packed4S" %}
7488   ins_encode %{
7489     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7490   %}
7491   ins_pipe( pipe_slow );
7492 %}
7493 
7494 instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{
7495   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7496   match(Set dst (MulVS src1 src2));
7497   format %{ "vpmullw $dst,$src1,$src2\t! mul packed4S" %}
7498   ins_encode %{
7499     int vector_len = 0;
7500     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7501   %}
7502   ins_pipe( pipe_slow );
7503 %}
7504 
7505 instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
7506   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7507   match(Set dst (MulVS src (LoadVector mem)));
7508   format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
7509   ins_encode %{
7510     int vector_len = 0;
7511     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7512   %}
7513   ins_pipe( pipe_slow );
7514 %}
7515 
7516 instruct vmul8S(vecX dst, vecX src) %{
7517   predicate(UseAVX == 0 && n->as_Vector()->length() == 8);
7518   match(Set dst (MulVS dst src));
7519   format %{ "pmullw  $dst,$src\t! mul packed8S" %}
7520   ins_encode %{
7521     __ pmullw($dst$$XMMRegister, $src$$XMMRegister);
7522   %}
7523   ins_pipe( pipe_slow );
7524 %}
7525 
7526 instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{
7527   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7528   match(Set dst (MulVS src1 src2));
7529   format %{ "vpmullw $dst,$src1,$src2\t! mul packed8S" %}
7530   ins_encode %{
7531     int vector_len = 0;
7532     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7533   %}
7534   ins_pipe( pipe_slow );
7535 %}
7536 
7537 instruct vmul8S_mem(vecX dst, vecX src, memory mem) %{
7538   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7539   match(Set dst (MulVS src (LoadVector mem)));
7540   format %{ "vpmullw $dst,$src,$mem\t! mul packed8S" %}
7541   ins_encode %{
7542     int vector_len = 0;
7543     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7544   %}
7545   ins_pipe( pipe_slow );
7546 %}
7547 
7548 instruct vmul16S_reg(vecY dst, vecY src1, vecY src2) %{
7549   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7550   match(Set dst (MulVS src1 src2));
7551   format %{ "vpmullw $dst,$src1,$src2\t! mul packed16S" %}
7552   ins_encode %{
7553     int vector_len = 1;
7554     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7555   %}
7556   ins_pipe( pipe_slow );
7557 %}
7558 
7559 instruct vmul16S_mem(vecY dst, vecY src, memory mem) %{
7560   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
7561   match(Set dst (MulVS src (LoadVector mem)));
7562   format %{ "vpmullw $dst,$src,$mem\t! mul packed16S" %}
7563   ins_encode %{
7564     int vector_len = 1;
7565     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7566   %}
7567   ins_pipe( pipe_slow );
7568 %}
7569 
7570 instruct vmul32S_reg(vecZ dst, vecZ src1, vecZ src2) %{
7571   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7572   match(Set dst (MulVS src1 src2));
7573   format %{ "vpmullw $dst,$src1,$src2\t! mul packed32S" %}
7574   ins_encode %{
7575     int vector_len = 2;
7576     __ vpmullw($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7577   %}
7578   ins_pipe( pipe_slow );
7579 %}
7580 
7581 instruct vmul32S_mem(vecZ dst, vecZ src, memory mem) %{
7582   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
7583   match(Set dst (MulVS src (LoadVector mem)));
7584   format %{ "vpmullw $dst,$src,$mem\t! mul packed32S" %}
7585   ins_encode %{
7586     int vector_len = 2;
7587     __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7588   %}
7589   ins_pipe( pipe_slow );
7590 %}
7591 
7592 // Integers vector mul (sse4_1)
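// pmulld/vpmulld require SSE4.1/AVX, hence the UseSSE > 3 predicate on the
// non-AVX forms.  The packed-long multiplies below use vpmullq, which is only
// available with AVX512DQ (see the supports_avx512dq() predicates).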
7593 instruct vmul2I(vecD dst, vecD src) %{
7594   predicate(UseSSE > 3 && n->as_Vector()->length() == 2);
7595   match(Set dst (MulVI dst src));
7596   format %{ "pmulld  $dst,$src\t! mul packed2I" %}
7597   ins_encode %{
7598     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7599   %}
7600   ins_pipe( pipe_slow );
7601 %}
7602 
7603 instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{
7604   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7605   match(Set dst (MulVI src1 src2));
7606   format %{ "vpmulld $dst,$src1,$src2\t! mul packed2I" %}
7607   ins_encode %{
7608     int vector_len = 0;
7609     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7610   %}
7611   ins_pipe( pipe_slow );
7612 %}
7613 
7614 instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
7615   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7616   match(Set dst (MulVI src (LoadVector mem)));
7617   format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
7618   ins_encode %{
7619     int vector_len = 0;
7620     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7621   %}
7622   ins_pipe( pipe_slow );
7623 %}
7624 
7625 instruct vmul4I(vecX dst, vecX src) %{
7626   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
7627   match(Set dst (MulVI dst src));
7628   format %{ "pmulld  $dst,$src\t! mul packed4I" %}
7629   ins_encode %{
7630     __ pmulld($dst$$XMMRegister, $src$$XMMRegister);
7631   %}
7632   ins_pipe( pipe_slow );
7633 %}
7634 
7635 instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{
7636   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7637   match(Set dst (MulVI src1 src2));
7638   format %{ "vpmulld $dst,$src1,$src2\t! mul packed4I" %}
7639   ins_encode %{
7640     int vector_len = 0;
7641     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7642   %}
7643   ins_pipe( pipe_slow );
7644 %}
7645 
7646 instruct vmul4I_mem(vecX dst, vecX src, memory mem) %{
7647   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7648   match(Set dst (MulVI src (LoadVector mem)));
7649   format %{ "vpmulld $dst,$src,$mem\t! mul packed4I" %}
7650   ins_encode %{
7651     int vector_len = 0;
7652     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7653   %}
7654   ins_pipe( pipe_slow );
7655 %}
7656 
7657 instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
7658   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7659   match(Set dst (MulVL src1 src2));
7660   format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
7661   ins_encode %{
7662     int vector_len = 0;
7663     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7664   %}
7665   ins_pipe( pipe_slow );
7666 %}
7667 
7668 instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
7669   predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
7670   match(Set dst (MulVL src (LoadVector mem)));
7671   format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
7672   ins_encode %{
7673     int vector_len = 0;
7674     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7675   %}
7676   ins_pipe( pipe_slow );
7677 %}
7678 
7679 instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
7680   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7681   match(Set dst (MulVL src1 src2));
7682   format %{ "vpmullq $dst,$src1,$src2\t! mul packed4L" %}
7683   ins_encode %{
7684     int vector_len = 1;
7685     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7686   %}
7687   ins_pipe( pipe_slow );
7688 %}
7689 
7690 instruct vmul4L_mem(vecY dst, vecY src, memory mem) %{
7691   predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
7692   match(Set dst (MulVL src (LoadVector mem)));
7693   format %{ "vpmullq $dst,$src,$mem\t! mul packed4L" %}
7694   ins_encode %{
7695     int vector_len = 1;
7696     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7697   %}
7698   ins_pipe( pipe_slow );
7699 %}
7700 
7701 instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
7702   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7703   match(Set dst (MulVL src1 src2));
7704   format %{ "vpmullq $dst,$src1,$src2\t! mul packed8L" %}
7705   ins_encode %{
7706     int vector_len = 2;
7707     __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7708   %}
7709   ins_pipe( pipe_slow );
7710 %}
7711 
7712 instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
7713   predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
7714   match(Set dst (MulVL src (LoadVector mem)));
7715   format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
7716   ins_encode %{
7717     int vector_len = 2;
7718     __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7719   %}
7720   ins_pipe( pipe_slow );
7721 %}
7722 
7723 instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
7724   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7725   match(Set dst (MulVI src1 src2));
7726   format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
7727   ins_encode %{
7728     int vector_len = 1;
7729     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7730   %}
7731   ins_pipe( pipe_slow );
7732 %}
7733 
7734 instruct vmul8I_mem(vecY dst, vecY src, memory mem) %{
7735   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
7736   match(Set dst (MulVI src (LoadVector mem)));
7737   format %{ "vpmulld $dst,$src,$mem\t! mul packed8I" %}
7738   ins_encode %{
7739     int vector_len = 1;
7740     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7741   %}
7742   ins_pipe( pipe_slow );
7743 %}
7744 
7745 instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
7746   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7747   match(Set dst (MulVI src1 src2));
7748   format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
7749   ins_encode %{
7750     int vector_len = 2;
7751     __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7752   %}
7753   ins_pipe( pipe_slow );
7754 %}
7755 
7756 instruct vmul16I_mem(vecZ dst, vecZ src, memory mem) %{
7757   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7758   match(Set dst (MulVI src (LoadVector mem)));
7759   format %{ "vpmulld $dst,$src,$mem\t! mul packed16I" %}
7760   ins_encode %{
7761     int vector_len = 2;
7762     __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7763   %}
7764   ins_pipe( pipe_slow );
7765 %}
7766 
7767 // Floats vector mul
7768 instruct vmul2F(vecD dst, vecD src) %{
7769   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7770   match(Set dst (MulVF dst src));
7771   format %{ "mulps   $dst,$src\t! mul packed2F" %}
7772   ins_encode %{
7773     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7774   %}
7775   ins_pipe( pipe_slow );
7776 %}
7777 
7778 instruct vmul2F_reg(vecD dst, vecD src1, vecD src2) %{
7779   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7780   match(Set dst (MulVF src1 src2));
7781   format %{ "vmulps  $dst,$src1,$src2\t! mul packed2F" %}
7782   ins_encode %{
7783     int vector_len = 0;
7784     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7785   %}
7786   ins_pipe( pipe_slow );
7787 %}
7788 
7789 instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
7790   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7791   match(Set dst (MulVF src (LoadVector mem)));
7792   format %{ "vmulps  $dst,$src,$mem\t! mul packed2F" %}
7793   ins_encode %{
7794     int vector_len = 0;
7795     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7796   %}
7797   ins_pipe( pipe_slow );
7798 %}
7799 
7800 instruct vmul4F(vecX dst, vecX src) %{
7801   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
7802   match(Set dst (MulVF dst src));
7803   format %{ "mulps   $dst,$src\t! mul packed4F" %}
7804   ins_encode %{
7805     __ mulps($dst$$XMMRegister, $src$$XMMRegister);
7806   %}
7807   ins_pipe( pipe_slow );
7808 %}
7809 
7810 instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{
7811   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7812   match(Set dst (MulVF src1 src2));
7813   format %{ "vmulps  $dst,$src1,$src2\t! mul packed4F" %}
7814   ins_encode %{
7815     int vector_len = 0;
7816     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7817   %}
7818   ins_pipe( pipe_slow );
7819 %}
7820 
7821 instruct vmul4F_mem(vecX dst, vecX src, memory mem) %{
7822   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7823   match(Set dst (MulVF src (LoadVector mem)));
7824   format %{ "vmulps  $dst,$src,$mem\t! mul packed4F" %}
7825   ins_encode %{
7826     int vector_len = 0;
7827     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7828   %}
7829   ins_pipe( pipe_slow );
7830 %}
7831 
7832 instruct vmul8F_reg(vecY dst, vecY src1, vecY src2) %{
7833   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7834   match(Set dst (MulVF src1 src2));
7835   format %{ "vmulps  $dst,$src1,$src2\t! mul packed8F" %}
7836   ins_encode %{
7837     int vector_len = 1;
7838     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7839   %}
7840   ins_pipe( pipe_slow );
7841 %}
7842 
7843 instruct vmul8F_mem(vecY dst, vecY src, memory mem) %{
7844   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7845   match(Set dst (MulVF src (LoadVector mem)));
7846   format %{ "vmulps  $dst,$src,$mem\t! mul packed8F" %}
7847   ins_encode %{
7848     int vector_len = 1;
7849     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7850   %}
7851   ins_pipe( pipe_slow );
7852 %}
7853 
7854 instruct vmul16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
7855   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7856   match(Set dst (MulVF src1 src2));
7857   format %{ "vmulps  $dst,$src1,$src2\t! mul packed16F" %}
7858   ins_encode %{
7859     int vector_len = 2;
7860     __ vmulps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7861   %}
7862   ins_pipe( pipe_slow );
7863 %}
7864 
7865 instruct vmul16F_mem(vecZ dst, vecZ src, memory mem) %{
7866   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
7867   match(Set dst (MulVF src (LoadVector mem)));
7868   format %{ "vmulps  $dst,$src,$mem\t! mul packed16F" %}
7869   ins_encode %{
7870     int vector_len = 2;
7871     __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7872   %}
7873   ins_pipe( pipe_slow );
7874 %}
7875 
7876 // Doubles vector mul
7877 instruct vmul2D(vecX dst, vecX src) %{
7878   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7879   match(Set dst (MulVD dst src));
7880   format %{ "mulpd   $dst,$src\t! mul packed2D" %}
7881   ins_encode %{
7882     __ mulpd($dst$$XMMRegister, $src$$XMMRegister);
7883   %}
7884   ins_pipe( pipe_slow );
7885 %}
7886 
7887 instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{
7888   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7889   match(Set dst (MulVD src1 src2));
7890   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed2D" %}
7891   ins_encode %{
7892     int vector_len = 0;
7893     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7894   %}
7895   ins_pipe( pipe_slow );
7896 %}
7897 
7898 instruct vmul2D_mem(vecX dst, vecX src, memory mem) %{
7899   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
7900   match(Set dst (MulVD src (LoadVector mem)));
7901   format %{ "vmulpd  $dst,$src,$mem\t! mul packed2D" %}
7902   ins_encode %{
7903     int vector_len = 0;
7904     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7905   %}
7906   ins_pipe( pipe_slow );
7907 %}
7908 
7909 instruct vmul4D_reg(vecY dst, vecY src1, vecY src2) %{
7910   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7911   match(Set dst (MulVD src1 src2));
7912   format %{ "vmulpd  $dst,$src1,$src2\t! mul packed4D" %}
7913   ins_encode %{
7914     int vector_len = 1;
7915     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7916   %}
7917   ins_pipe( pipe_slow );
7918 %}
7919 
7920 instruct vmul4D_mem(vecY dst, vecY src, memory mem) %{
7921   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7922   match(Set dst (MulVD src (LoadVector mem)));
7923   format %{ "vmulpd  $dst,$src,$mem\t! mul packed4D" %}
7924   ins_encode %{
7925     int vector_len = 1;
7926     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7927   %}
7928   ins_pipe( pipe_slow );
7929 %}
7930 
7931 instruct vmul8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
7932   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7933   match(Set dst (MulVD src1 src2));
7934   format %{ "vmulpd  $dst k0,$src1,$src2\t! mul packed8D" %}
7935   ins_encode %{
7936     int vector_len = 2;
7937     __ vmulpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
7938   %}
7939   ins_pipe( pipe_slow );
7940 %}
7941 
7942 instruct vmul8D_mem(vecZ dst, vecZ src, memory mem) %{
7943   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
7944   match(Set dst (MulVD src (LoadVector mem)));
7945   format %{ "vmulpd  $dst k0,$src,$mem\t! mul packed8D" %}
7946   ins_encode %{
7947     int vector_len = 2;
7948     __ vmulpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
7949   %}
7950   ins_pipe( pipe_slow );
7951 %}
7952 
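// Vector conditional move (CMoveV): lowered to a packed compare that leaves an
// all-ones/all-zeros mask in each lane of $dst, followed by a variable blend
// that uses that mask to select, lane by lane, between $src1 and $src2.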
7953 instruct vcmov8F_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7954   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
7955   match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
7956   effect(TEMP dst, USE src1, USE src2);
7957   format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
7958             "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
7959          %}
7960   ins_encode %{
7961     int vector_len = 1;
7962     int cond = (Assembler::Condition)($copnd$$cmpcode);
7963     __ cmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7964     __ blendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7965   %}
7966   ins_pipe( pipe_slow );
7967 %}
7968 
7969 instruct vcmov4D_reg(legVecY dst, legVecY src1, legVecY src2, immI8 cop, cmpOp_vcmppd copnd) %{
7970   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
7971   match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
7972   effect(TEMP dst, USE src1, USE src2);
7973   format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
7974             "blendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
7975          %}
7976   ins_encode %{
7977     int vector_len = 1;
7978     int cond = (Assembler::Condition)($copnd$$cmpcode);
7979     __ cmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vector_len);
7980     __ blendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vector_len);
7981   %}
7982   ins_pipe( pipe_slow );
7983 %}
7984 
7985 // --------------------------------- DIV --------------------------------------
7986 
7987 // Floats vector div
7988 instruct vdiv2F(vecD dst, vecD src) %{
7989   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
7990   match(Set dst (DivVF dst src));
7991   format %{ "divps   $dst,$src\t! div packed2F" %}
7992   ins_encode %{
7993     __ divps($dst$$XMMRegister, $src$$XMMRegister);
7994   %}
7995   ins_pipe( pipe_slow );
7996 %}
7997 
7998 instruct vdiv2F_reg(vecD dst, vecD src1, vecD src2) %{
7999   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8000   match(Set dst (DivVF src1 src2));
8001   format %{ "vdivps  $dst,$src1,$src2\t! div packed2F" %}
8002   ins_encode %{
8003     int vector_len = 0;
8004     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8005   %}
8006   ins_pipe( pipe_slow );
8007 %}
8008 
8009 instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
8010   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8011   match(Set dst (DivVF src (LoadVector mem)));
8012   format %{ "vdivps  $dst,$src,$mem\t! div packed2F" %}
8013   ins_encode %{
8014     int vector_len = 0;
8015     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8016   %}
8017   ins_pipe( pipe_slow );
8018 %}
8019 
8020 instruct vdiv4F(vecX dst, vecX src) %{
8021   predicate(UseAVX == 0 && n->as_Vector()->length() == 4);
8022   match(Set dst (DivVF dst src));
8023   format %{ "divps   $dst,$src\t! div packed4F" %}
8024   ins_encode %{
8025     __ divps($dst$$XMMRegister, $src$$XMMRegister);
8026   %}
8027   ins_pipe( pipe_slow );
8028 %}
8029 
8030 instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{
8031   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8032   match(Set dst (DivVF src1 src2));
8033   format %{ "vdivps  $dst,$src1,$src2\t! div packed4F" %}
8034   ins_encode %{
8035     int vector_len = 0;
8036     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8037   %}
8038   ins_pipe( pipe_slow );
8039 %}
8040 
8041 instruct vdiv4F_mem(vecX dst, vecX src, memory mem) %{
8042   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8043   match(Set dst (DivVF src (LoadVector mem)));
8044   format %{ "vdivps  $dst,$src,$mem\t! div packed4F" %}
8045   ins_encode %{
8046     int vector_len = 0;
8047     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8048   %}
8049   ins_pipe( pipe_slow );
8050 %}
8051 
8052 instruct vdiv8F_reg(vecY dst, vecY src1, vecY src2) %{
8053   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8054   match(Set dst (DivVF src1 src2));
8055   format %{ "vdivps  $dst,$src1,$src2\t! div packed8F" %}
8056   ins_encode %{
8057     int vector_len = 1;
8058     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8059   %}
8060   ins_pipe( pipe_slow );
8061 %}
8062 
8063 instruct vdiv8F_mem(vecY dst, vecY src, memory mem) %{
8064   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8065   match(Set dst (DivVF src (LoadVector mem)));
8066   format %{ "vdivps  $dst,$src,$mem\t! div packed8F" %}
8067   ins_encode %{
8068     int vector_len = 1;
8069     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8070   %}
8071   ins_pipe( pipe_slow );
8072 %}
8073 
8074 instruct vdiv16F_reg(vecZ dst, vecZ src1, vecZ src2) %{
8075   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8076   match(Set dst (DivVF src1 src2));
8077   format %{ "vdivps  $dst,$src1,$src2\t! div packed16F" %}
8078   ins_encode %{
8079     int vector_len = 2;
8080     __ vdivps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8081   %}
8082   ins_pipe( pipe_slow );
8083 %}
8084 
8085 instruct vdiv16F_mem(vecZ dst, vecZ src, memory mem) %{
8086   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8087   match(Set dst (DivVF src (LoadVector mem)));
8088   format %{ "vdivps  $dst,$src,$mem\t! div packed16F" %}
8089   ins_encode %{
8090     int vector_len = 2;
8091     __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8092   %}
8093   ins_pipe( pipe_slow );
8094 %}
8095 
8096 // Doubles vector div
8097 instruct vdiv2D(vecX dst, vecX src) %{
8098   predicate(UseAVX == 0 && n->as_Vector()->length() == 2);
8099   match(Set dst (DivVD dst src));
8100   format %{ "divpd   $dst,$src\t! div packed2D" %}
8101   ins_encode %{
8102     __ divpd($dst$$XMMRegister, $src$$XMMRegister);
8103   %}
8104   ins_pipe( pipe_slow );
8105 %}
8106 
8107 instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{
8108   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8109   match(Set dst (DivVD src1 src2));
8110   format %{ "vdivpd  $dst,$src1,$src2\t! div packed2D" %}
8111   ins_encode %{
8112     int vector_len = 0;
8113     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8114   %}
8115   ins_pipe( pipe_slow );
8116 %}
8117 
8118 instruct vdiv2D_mem(vecX dst, vecX src, memory mem) %{
8119   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8120   match(Set dst (DivVD src (LoadVector mem)));
8121   format %{ "vdivpd  $dst,$src,$mem\t! div packed2D" %}
8122   ins_encode %{
8123     int vector_len = 0;
8124     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8125   %}
8126   ins_pipe( pipe_slow );
8127 %}
8128 
8129 instruct vdiv4D_reg(vecY dst, vecY src1, vecY src2) %{
8130   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8131   match(Set dst (DivVD src1 src2));
8132   format %{ "vdivpd  $dst,$src1,$src2\t! div packed4D" %}
8133   ins_encode %{
8134     int vector_len = 1;
8135     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8136   %}
8137   ins_pipe( pipe_slow );
8138 %}
8139 
8140 instruct vdiv4D_mem(vecY dst, vecY src, memory mem) %{
8141   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8142   match(Set dst (DivVD src (LoadVector mem)));
8143   format %{ "vdivpd  $dst,$src,$mem\t! div packed4D" %}
8144   ins_encode %{
8145     int vector_len = 1;
8146     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8147   %}
8148   ins_pipe( pipe_slow );
8149 %}
8150 
8151 instruct vdiv8D_reg(vecZ dst, vecZ src1, vecZ src2) %{
8152   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8153   match(Set dst (DivVD src1 src2));
8154   format %{ "vdivpd  $dst,$src1,$src2\t! div packed8D" %}
8155   ins_encode %{
8156     int vector_len = 2;
8157     __ vdivpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8158   %}
8159   ins_pipe( pipe_slow );
8160 %}
8161 
8162 instruct vdiv8D_mem(vecZ dst, vecZ src, memory mem) %{
8163   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8164   match(Set dst (DivVD src (LoadVector mem)));
8165   format %{ "vdivpd  $dst,$src,$mem\t! div packed8D" %}
8166   ins_encode %{
8167     int vector_len = 2;
8168     __ vdivpd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8169   %}
8170   ins_pipe( pipe_slow );
8171 %}
8172 
8173 // --------------------------------- Sqrt --------------------------------------
8174 
8175 // Floating point vector sqrt
8176 instruct vsqrt2D_reg(vecX dst, vecX src) %{
8177   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8178   match(Set dst (SqrtVD src));
8179   format %{ "vsqrtpd  $dst,$src\t! sqrt packed2D" %}
8180   ins_encode %{
8181     int vector_len = 0;
8182     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8183   %}
8184   ins_pipe( pipe_slow );
8185 %}
8186 
8187 instruct vsqrt2D_mem(vecX dst, memory mem) %{
8188   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8189   match(Set dst (SqrtVD (LoadVector mem)));
8190   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed2D" %}
8191   ins_encode %{
8192     int vector_len = 0;
8193     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8194   %}
8195   ins_pipe( pipe_slow );
8196 %}
8197 
8198 instruct vsqrt4D_reg(vecY dst, vecY src) %{
8199   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8200   match(Set dst (SqrtVD src));
8201   format %{ "vsqrtpd  $dst,$src\t! sqrt packed4D" %}
8202   ins_encode %{
8203     int vector_len = 1;
8204     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8205   %}
8206   ins_pipe( pipe_slow );
8207 %}
8208 
8209 instruct vsqrt4D_mem(vecY dst, memory mem) %{
8210   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8211   match(Set dst (SqrtVD (LoadVector mem)));
8212   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed4D" %}
8213   ins_encode %{
8214     int vector_len = 1;
8215     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8216   %}
8217   ins_pipe( pipe_slow );
8218 %}
8219 
8220 instruct vsqrt8D_reg(vecZ dst, vecZ src) %{
8221   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8222   match(Set dst (SqrtVD src));
8223   format %{ "vsqrtpd  $dst,$src\t! sqrt packed8D" %}
8224   ins_encode %{
8225     int vector_len = 2;
8226     __ vsqrtpd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8227   %}
8228   ins_pipe( pipe_slow );
8229 %}
8230 
8231 instruct vsqrt8D_mem(vecZ dst, memory mem) %{
8232   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8233   match(Set dst (SqrtVD (LoadVector mem)));
8234   format %{ "vsqrtpd  $dst,$mem\t! sqrt packed8D" %}
8235   ins_encode %{
8236     int vector_len = 2;
8237     __ vsqrtpd($dst$$XMMRegister, $mem$$Address, vector_len);
8238   %}
8239   ins_pipe( pipe_slow );
8240 %}
8241 
8242 instruct vsqrt2F_reg(vecD dst, vecD src) %{
8243   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8244   match(Set dst (SqrtVF src));
8245   format %{ "vsqrtps  $dst,$src\t! sqrt packed2F" %}
8246   ins_encode %{
8247     int vector_len = 0;
8248     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8249   %}
8250   ins_pipe( pipe_slow );
8251 %}
8252 
8253 instruct vsqrt2F_mem(vecD dst, memory mem) %{
8254   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
8255   match(Set dst (SqrtVF (LoadVector mem)));
8256   format %{ "vsqrtps  $dst,$mem\t! sqrt packed2F" %}
8257   ins_encode %{
8258     int vector_len = 0;
8259     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8260   %}
8261   ins_pipe( pipe_slow );
8262 %}
8263 
8264 instruct vsqrt4F_reg(vecX dst, vecX src) %{
8265   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8266   match(Set dst (SqrtVF src));
8267   format %{ "vsqrtps  $dst,$src\t! sqrt packed4F" %}
8268   ins_encode %{
8269     int vector_len = 0;
8270     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8271   %}
8272   ins_pipe( pipe_slow );
8273 %}
8274 
8275 instruct vsqrt4F_mem(vecX dst, memory mem) %{
8276   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
8277   match(Set dst (SqrtVF (LoadVector mem)));
8278   format %{ "vsqrtps  $dst,$mem\t! sqrt packed4F" %}
8279   ins_encode %{
8280     int vector_len = 0;
8281     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8282   %}
8283   ins_pipe( pipe_slow );
8284 %}
8285 
8286 instruct vsqrt8F_reg(vecY dst, vecY src) %{
8287   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8288   match(Set dst (SqrtVF src));
8289   format %{ "vsqrtps  $dst,$src\t! sqrt packed8F" %}
8290   ins_encode %{
8291     int vector_len = 1;
8292     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8293   %}
8294   ins_pipe( pipe_slow );
8295 %}
8296 
8297 instruct vsqrt8F_mem(vecY dst, memory mem) %{
8298   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
8299   match(Set dst (SqrtVF (LoadVector mem)));
8300   format %{ "vsqrtps  $dst,$mem\t! sqrt packed8F" %}
8301   ins_encode %{
8302     int vector_len = 1;
8303     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8304   %}
8305   ins_pipe( pipe_slow );
8306 %}
8307 
8308 instruct vsqrt16F_reg(vecZ dst, vecZ src) %{
8309   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8310   match(Set dst (SqrtVF src));
8311   format %{ "vsqrtps  $dst,$src\t! sqrt packed16F" %}
8312   ins_encode %{
8313     int vector_len = 2;
8314     __ vsqrtps($dst$$XMMRegister, $src$$XMMRegister, vector_len);
8315   %}
8316   ins_pipe( pipe_slow );
8317 %}
8318 
8319 instruct vsqrt16F_mem(vecZ dst, memory mem) %{
8320   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8321   match(Set dst (SqrtVF (LoadVector mem)));
8322   format %{ "vsqrtps  $dst,$mem\t! sqrt packed16F" %}
8323   ins_encode %{
8324     int vector_len = 2;
8325     __ vsqrtps($dst$$XMMRegister, $mem$$Address, vector_len);
8326   %}
8327   ins_pipe( pipe_slow );
8328 %}
8329 
8330 // ------------------------------ Shift ---------------------------------------
8331 
8332 // Left and right shift count vectors are the same on x86
8333 // (only lowest bits of xmm reg are used for count).
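// For example, psllw/psrlw/psraw and their VEX forms all take the count from
// the low 64 bits of the xmm operand, so one movdl of the scalar count can
// feed both LShiftCntV and RShiftCntV users.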
8334 instruct vshiftcnt(vecS dst, rRegI cnt) %{
8335   match(Set dst (LShiftCntV cnt));
8336   match(Set dst (RShiftCntV cnt));
8337   format %{ "movdl    $dst,$cnt\t! load shift count" %}
8338   ins_encode %{
8339     __ movdl($dst$$XMMRegister, $cnt$$Register);
8340   %}
8341   ins_pipe( pipe_slow );
8342 %}
8343 
8344 instruct vshiftcntimm(vecS dst, immI8 cnt, rRegI tmp) %{
8345   match(Set dst cnt);
8346   effect(TEMP tmp);
8347   format %{ "movl    $tmp,$cnt\t"
8348             "movdl   $dst,$tmp\t! load shift count" %}
8349   ins_encode %{
8350     __ movl($tmp$$Register, $cnt$$constant);
8351     __ movdl($dst$$XMMRegister, $tmp$$Register);
8352   %}
8353   ins_pipe( pipe_slow );
8354 %}
8355 
8356 // Byte vector shift
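// x86 has no packed-byte shift instruction, so byte shifts are emulated:
// widen the bytes to words (vextendbw), shift the words, mask each result
// back down to 8 bits, and repack with packuswb.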
8357 instruct vshift4B(vecS dst, vecS src, vecS shift, vecS tmp, rRegI scratch) %{
8358   predicate(UseSSE > 3 && n->as_Vector()->length() == 4);
8359   match(Set dst (LShiftVB src shift));
8360   match(Set dst (RShiftVB src shift));
8361   match(Set dst (URShiftVB src shift));
8362   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
8363   format %{"vextendbw $tmp,$src\n\t"
8364            "vshiftw   $tmp,$shift\n\t"
8365            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
8366            "pand      $dst,$tmp\n\t"
8367            "packuswb  $dst,$dst\n\t ! packed4B shift" %}
8368   ins_encode %{
8369     int opcode = this->as_Mach()->ideal_Opcode();
8370 
8371     __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister);
8372     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
8373     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); 
8374     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
8375     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
8376   %}
8377   ins_pipe( pipe_slow );
8378 %}
8379 
8380 instruct vshift8B(vecD dst, vecD src, vecS shift, vecD tmp, rRegI scratch) %{
8381   predicate(UseSSE > 3 && n->as_Vector()->length() == 8);
8382   match(Set dst (LShiftVB src shift));
8383   match(Set dst (RShiftVB src shift));
8384   match(Set dst (URShiftVB src shift));
8385   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
8386   format %{"vextendbw $tmp,$src\n\t"
8387            "vshiftw   $tmp,$shift\n\t"
8388            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
8389            "pand      $dst,$tmp\n\t"
8390            "packuswb  $dst,$dst\n\t ! packed8B shift" %}
8391   ins_encode %{
8392     int opcode = this->as_Mach()->ideal_Opcode();
8393 
8394     __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister);
8395     __ vshiftw(opcode, $tmp$$XMMRegister, $shift$$XMMRegister);
8396     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register); 
8397     __ pand($dst$$XMMRegister, $tmp$$XMMRegister);
8398     __ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
8399   %}
8400   ins_pipe( pipe_slow );
8401 %}
8402 
8403 instruct vshift16B(vecX dst, vecX src, vecS shift, vecX tmp1, vecX tmp2, rRegI scratch) %{
8404   predicate(UseSSE > 3  && UseAVX <= 1 && n->as_Vector()->length() == 16);
8405   match(Set dst (LShiftVB src shift));
8406   match(Set dst (RShiftVB src shift));
8407   match(Set dst (URShiftVB src shift));
8408   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
8409   format %{"vextendbw $tmp1,$src\n\t"
8410            "vshiftw   $tmp1,$shift\n\t"
8411            "pshufd    $tmp2,$src\n\t"
8412            "vextendbw $tmp2,$tmp2\n\t"
8413            "vshiftw   $tmp2,$shift\n\t"
8414            "movdqu    $dst,[0x00ff00ff0x00ff00ff]\n\t"
8415            "pand      $tmp2,$dst\n\t"
8416            "pand      $dst,$tmp1\n\t"
8417            "packuswb  $dst,$tmp2\n\t! packed16B shift" %}
8418   ins_encode %{
8419     int opcode = this->as_Mach()->ideal_Opcode();
8420 
8421     __ vextendbw(opcode, $tmp1$$XMMRegister, $src$$XMMRegister);
8422     __ vshiftw(opcode, $tmp1$$XMMRegister, $shift$$XMMRegister);
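    // pshufd with 0xE copies the upper 64 bits of $src into the low half of
    // $tmp2 so the second group of 8 bytes can be widened and shifted too.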
8423     __ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
8424     __ vextendbw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
8425     __ vshiftw(opcode, $tmp2$$XMMRegister, $shift$$XMMRegister);
8426     __ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
8427     __ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
8428     __ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
8429     __ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
8430   %}
8431   ins_pipe( pipe_slow );
8432 %}
8433 
8434 instruct vshift16B_avx(vecX dst, vecX src, vecS shift, vecX tmp, rRegI scratch) %{
8435   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8436   match(Set dst (LShiftVB src shift));
8437   match(Set dst (RShiftVB src shift));
8438   match(Set dst (URShiftVB src shift));
8439   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
8440   format %{"vextendbw  $tmp,$src\n\t"
8441            "vshiftw    $tmp,$tmp,$shift\n\t"
8442            "vpand      $tmp,$tmp,[0x00ff00ff0x00ff00ff]\n\t"
8443            "vextracti128_high  $dst,$tmp\n\t"
8444            "vpackuswb  $dst,$tmp,$dst\n\t! packed16B shift" %}
8445   ins_encode %{
8446     int opcode = this->as_Mach()->ideal_Opcode();
8447 
8448     int vector_len = 1;
8449     __ vextendbw(opcode, $tmp$$XMMRegister, $src$$XMMRegister, vector_len);
8450     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
8451     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
8452     __ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
8453     __ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
8454   %}
8455   ins_pipe( pipe_slow );
8456 %}
8457 
8458 instruct vshift32B_avx(vecY dst, vecY src, vecS shift, vecY tmp, rRegI scratch) %{
8459   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
8460   match(Set dst (LShiftVB src shift));
8461   match(Set dst (RShiftVB src shift));
8462   match(Set dst (URShiftVB src shift));
8463   effect(TEMP dst, USE src, USE shift, TEMP tmp, TEMP scratch);
8464   format %{"vextracti128_high  $tmp,$src\n\t"
8465            "vextendbw  $tmp,$tmp\n\t"
8466            "vextendbw  $dst,$src\n\t"
8467            "vshiftw    $tmp,$tmp,$shift\n\t"
8468            "vshiftw    $dst,$dst,$shift\n\t"
8469            "vpand      $tmp,$tmp,[0x00ff00ff0x00ff00ff]\n\t"
8470            "vpand      $dst,$dst,[0x00ff00ff0x00ff00ff]\n\t"
8471            "vpackuswb  $dst,$dst,$tmp\n\t"
8472            "vpermq     $dst,$dst,0xD8\n\t! packed32B shift" %}
8473   ins_encode %{
8474     int opcode = this->as_Mach()->ideal_Opcode();
8475 
8476     int vector_len = 1;
8477     __ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
8478     __ vextendbw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, vector_len);
8479     __ vextendbw(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len);
8480     __ vshiftw(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
8481     __ vshiftw(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$XMMRegister, vector_len);
8482     __ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
8483     __ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vector_len, $scratch$$Register);
8484     __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8485     __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vector_len);
8486   %}
8487   ins_pipe( pipe_slow );
8488 %}
8489 
8490 instruct vshift64B_avx(vecZ dst, vecZ src, vecS shift, vecZ tmp1, vecZ tmp2, rRegI scratch) %{
8491   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
8492   match(Set dst (LShiftVB src shift));
8493   match(Set dst (RShiftVB src shift));
8494   match(Set dst (URShiftVB src shift));
8495   effect(TEMP dst, USE src, USE shift, TEMP tmp1, TEMP tmp2, TEMP scratch);
8496   format %{"vextracti64x4  $tmp1,$src\n\t"
8497            "vextendbw      $tmp1,$tmp1\n\t"
8498            "vextendbw      $tmp2,$src\n\t"
8499            "vshiftw        $tmp1,$tmp1,$shift\n\t"
8500            "vshiftw        $tmp2,$tmp2,$shift\n\t"
8501            "vmovdqu        $dst,[0x00ff00ff0x00ff00ff]\n\t"
8502            "vpbroadcastd   $dst,$dst\n\t"
8503            "vpand          $tmp1,$tmp1,$dst\n\t"
8504            "vpand          $tmp2,$tmp2,$dst\n\t"
8505            "vpackuswb      $dst,$tmp1,$tmp2\n\t"
8506            "evmovdquq      $tmp2, [0x0604020007050301]\n\t"
8507            "vpermq         $dst,$tmp2,$dst\n\t! packed64B shift" %}
8508   ins_encode %{
8509     int opcode = this->as_Mach()->ideal_Opcode();
8510 
8511     int vector_len = 2;
8512     __ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
8513     __ vextendbw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vector_len);
8514     __ vextendbw(opcode, $tmp2$$XMMRegister, $src$$XMMRegister, vector_len);
8515     __ vshiftw(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$XMMRegister, vector_len);
8516     __ vshiftw(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$XMMRegister, vector_len);
8517     __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
8518     __ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
8519     __ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vector_len);
8520     __ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
8521     __ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vector_len);
8522     __ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vector_len, $scratch$$Register);
8523     __ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vector_len);
8524   %}
8525   ins_pipe( pipe_slow );
8526 %}
8527 
8528 // A logical right shift of a short vector produces an incorrect Java result
8529 // for negative data, because Java code converts the short value to an int with
8530 // sign extension before shifting. Char vectors are fine, though, since chars
8531 // are unsigned values.
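// Illustration (Java semantics, not emitted code): for short s = (short)0x8000,
// (short)(s >>> 2) sign extends s to the int 0xFFFF8000, shifts it to
// 0x3FFFE000 and truncates to 0xE000, while a 16-bit lane-wise logical shift
// of 0x8000 gives 0x2000. A char 0x8000 is zero extended, so both routes
// agree on 0x2000.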
8532 // Shorts/Chars vector shift
8533 instruct vshift2S(vecS dst, vecS src, vecS shift) %{
8534   predicate(n->as_Vector()->length() == 2);
8535   match(Set dst (LShiftVS src shift));
8536   match(Set dst (RShiftVS src shift));
8537   match(Set dst (URShiftVS src shift));
8538   effect(TEMP dst, USE src, USE shift);
8539   format %{ "vshiftw  $dst,$src,$shift\t! shift packed2S" %}
8540   ins_encode %{
8541     int opcode = this->as_Mach()->ideal_Opcode();
8542     if (UseAVX == 0) { 
8543       if ($dst$$XMMRegister != $src$$XMMRegister)
8544          __ movflt($dst$$XMMRegister, $src$$XMMRegister);
8545       __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8546     } else {
8547       int vector_len = 0;
8548       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8549     }
8550   %}
8551   ins_pipe( pipe_slow );
8552 %}
8553 
8554 instruct vshift4S(vecD dst, vecD src, vecS shift) %{
8555   predicate(n->as_Vector()->length() == 4);
8556   match(Set dst (LShiftVS src shift));
8557   match(Set dst (RShiftVS src shift));
8558   match(Set dst (URShiftVS src shift));
8559   effect(TEMP dst, USE src, USE shift);
8560   format %{ "vshiftw  $dst,$src,$shift\t! shift packed4S" %}
8561   ins_encode %{
8562     int opcode = this->as_Mach()->ideal_Opcode();
8563     if (UseAVX == 0) { 
8564       if ($dst$$XMMRegister != $src$$XMMRegister)
8565          __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
8566       __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8568     } else {
8569       int vector_len = 0;
8570       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8571     }
8572   %}
8573   ins_pipe( pipe_slow );
8574 %}
8575 
8576 instruct vshift8S(vecX dst, vecX src, vecS shift) %{
8577   predicate(n->as_Vector()->length() == 8);
8578   match(Set dst (LShiftVS src shift));
8579   match(Set dst (RShiftVS src shift));
8580   match(Set dst (URShiftVS src shift));
8581   effect(TEMP dst, USE src, USE shift);
8582   format %{ "vshiftw  $dst,$src,$shift\t! shift packed8S" %}
8583   ins_encode %{
8584     int opcode = this->as_Mach()->ideal_Opcode();
8585     if (UseAVX == 0) { 
8586       if ($dst$$XMMRegister != $src$$XMMRegister)
8587          __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8588       __ vshiftw(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8589     } else {
8590       int vector_len = 0;
8591       __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8592     }
8593   %}
8594   ins_pipe( pipe_slow );
8595 %}
8596 
8597 instruct vshift16S(vecY dst, vecY src, vecS shift) %{
8598   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
8599   match(Set dst (LShiftVS src shift));
8600   match(Set dst (RShiftVS src shift));
8601   match(Set dst (URShiftVS src shift));
8602   effect(DEF dst, USE src, USE shift);
8603   format %{ "vshiftw  $dst,$src,$shift\t! shift packed16S" %}
8604   ins_encode %{
8605     int vector_len = 1;
8606     int opcode = this->as_Mach()->ideal_Opcode();
8607     __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8608   %}
8609   ins_pipe( pipe_slow );
8610 %}
8611 
8612 instruct vshift32S(vecZ dst, vecZ src, vecS shift) %{
8613   predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 32);
8614   match(Set dst (LShiftVS src shift));
8615   match(Set dst (RShiftVS src shift));
8616   match(Set dst (URShiftVS src shift));
8617   effect(DEF dst, USE src, USE shift);
8618   format %{ "vshiftw  $dst,$src,$shift\t! shift packed32S" %}
8619   ins_encode %{
8620     int vector_len = 2;
8621     int opcode = this->as_Mach()->ideal_Opcode();
8622     __ vshiftw(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8623   %}
8624   ins_pipe( pipe_slow );
8625 %}
8626 
8627 // Integers vector shift
8628 instruct vshift2I(vecD dst, vecD src, vecS shift) %{
8629   predicate(n->as_Vector()->length() == 2);
8630   match(Set dst (LShiftVI src shift));
8631   match(Set dst (RShiftVI src shift));
8632   match(Set dst (URShiftVI src shift));
8633   effect(TEMP dst, USE src, USE shift);
8634   format %{ "vshiftd  $dst,$src,$shift\t! shift packed2I" %}
8635   ins_encode %{
8636     int opcode = this->as_Mach()->ideal_Opcode();
8637     if (UseAVX == 0) { 
8638       if ($dst$$XMMRegister != $src$$XMMRegister)
8639          __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
8640       __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8641     } else {
8642       int vector_len = 0;
8643       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8644     }
8645   %}
8646   ins_pipe( pipe_slow );
8647 %}
8648 
8649 instruct vshift4I(vecX dst, vecX src, vecS shift) %{
8650   predicate(n->as_Vector()->length() == 4);
8651   match(Set dst (LShiftVI src shift));
8652   match(Set dst (RShiftVI src shift));
8653   match(Set dst (URShiftVI src shift));
8654   effect(TEMP dst, USE src, USE shift);
8655   format %{ "vshiftd  $dst,$src,$shift\t! shift packed4I" %}
8656   ins_encode %{
8657     int opcode = this->as_Mach()->ideal_Opcode();
8658     if (UseAVX == 0) { 
8659       if ($dst$$XMMRegister != $src$$XMMRegister)
8660          __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8661       __ vshiftd(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8662     } else {
8663       int vector_len = 0;
8664       __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8665     }
8666   %}
8667   ins_pipe( pipe_slow );
8668 %}
8669 
8670 instruct vshift8I(vecY dst, vecY src, vecS shift) %{
8671   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
8672   match(Set dst (LShiftVI src shift));
8673   match(Set dst (RShiftVI src shift));
8674   match(Set dst (URShiftVI src shift));
8675   effect(DEF dst, USE src, USE shift);
8676   format %{ "vshiftd  $dst,$src,$shift\t! shift packed8I" %}
8677   ins_encode %{
8678     int vector_len = 1;
8679     int opcode = this->as_Mach()->ideal_Opcode();
8680     __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8681   %}
8682   ins_pipe( pipe_slow );
8683 %}
8684 
8685 instruct vshift16I(vecZ dst, vecZ src, vecS shift) %{
8686   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
8687   match(Set dst (LShiftVI src shift));
8688   match(Set dst (RShiftVI src shift));
8689   match(Set dst (URShiftVI src shift));
8690   effect(DEF dst, USE src, USE shift);
8691   format %{ "vshiftd  $dst,$src,$shift\t! shift packed16I" %}
8692   ins_encode %{
8693     int vector_len = 2;
8694     int opcode = this->as_Mach()->ideal_Opcode();
8695     __ vshiftd(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8696   %}
8697   ins_pipe( pipe_slow );
8698 %}
8699 
8700 // Longs vector shift
8701 instruct vshift2L(vecX dst, vecX src, vecS shift) %{
8702   predicate(n->as_Vector()->length() == 2);
8703   match(Set dst (LShiftVL src shift));
8704   match(Set dst (URShiftVL src shift));
8705   effect(TEMP dst, USE src, USE shift);
8706   format %{ "vshiftq  $dst,$src,$shift\t! shift packed2L" %}
8707   ins_encode %{
8708     int opcode = this->as_Mach()->ideal_Opcode();
8709     if (UseAVX == 0) { 
8710       if ($dst$$XMMRegister != $src$$XMMRegister)
8711          __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8712       __ vshiftq(opcode, $dst$$XMMRegister, $shift$$XMMRegister);
8713     } else {
8714       int vector_len = 0;
8715       __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8716     }
8717   %}
8718   ins_pipe( pipe_slow );
8719 %}
8720 
8721 instruct vshift4L(vecY dst, vecY src, vecS shift) %{
8722   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8723   match(Set dst (LShiftVL src shift));
8724   match(Set dst (URShiftVL src shift));
8725   effect(DEF dst, USE src, USE shift);
8726   format %{ "vshiftq  $dst,$src,$shift\t! left shift packed4L" %}
8727   ins_encode %{
8728     int vector_len = 1;
8729     int opcode = this->as_Mach()->ideal_Opcode();
8730     __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8731   %}
8732   ins_pipe( pipe_slow );
8733 %}
8734 
8735 instruct vshift8L(vecZ dst, vecZ src, vecS shift) %{
8736   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
8737   match(Set dst (LShiftVL src shift));
8738   match(Set dst (RShiftVL src shift));
8739   match(Set dst (URShiftVL src shift));
8740   effect(DEF dst, USE src, USE shift);
8741   format %{ "vshiftq  $dst,$src,$shift\t! shift packed8L" %}
8742   ins_encode %{
8743     int vector_len = 2;
8744     int opcode = this->as_Mach()->ideal_Opcode();
8745     __ vshiftq(opcode, $dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8746   %}
8747   ins_pipe( pipe_slow );
8748 %}
8749 
8750 // -------------------ArithmeticRightShift -----------------------------------
8751 // Long vector arithmetic right shift
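// There is no packed arithmetic right shift for longs before AVX-512
// (vpsraq is EVEX-only), so it is emulated from the logical shift using the
// identity, for shift counts 0 <= n < 64:
//   sra(x, n) == ((x >>> n) ^ m) - m,  where m = 0x8000000000000000 >>> n
// Worked 8-bit example: 0xF0 sra 2 -> (0x3C ^ 0x20) - 0x20 = 0x1C - 0x20 = 0xFC (-4).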
8752 instruct vsra2L_reg(vecX dst, vecX src, vecS shift, vecX tmp, rRegI scratch) %{
8753   predicate(UseSSE >= 2 && n->as_Vector()->length() == 2);
8754   match(Set dst (RShiftVL src shift));
8755   effect(TEMP dst, TEMP tmp, TEMP scratch);
8756   format %{ "movdqu  $dst,$src\n\t"
8757             "psrlq   $dst,$shift\n\t"
8758             "movdqu  $tmp,[0x8000000000000000]\n\t"
8759             "psrlq   $tmp,$shift\n\t"
8760             "pxor    $dst,$tmp\n\t"
8761             "psubq   $dst,$tmp\t! arithmetic right shift packed2L" %}
8762   ins_encode %{
8763     __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
8764     __ psrlq($dst$$XMMRegister, $shift$$XMMRegister);
8765     __ movdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
8766     __ psrlq($tmp$$XMMRegister, $shift$$XMMRegister);
8767     __ pxor($dst$$XMMRegister, $tmp$$XMMRegister);
8768     __ psubq($dst$$XMMRegister, $tmp$$XMMRegister);
8769   %}
8770   ins_pipe( pipe_slow );
8771 %}
8772 
8773 instruct vsra2L_reg_evex(vecX dst, vecX src, vecS shift) %{
8774   predicate(UseAVX > 2 && n->as_Vector()->length() == 2);
8775   match(Set dst (RShiftVL src shift));
8776   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed2L" %}
8777   ins_encode %{
8778     int vector_len = 0;
8779     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8780   %}
8781   ins_pipe( pipe_slow );
8782 %}
8783 
8784 instruct vsra4L_reg(vecY dst, vecY src, vecS shift, vecY tmp, rRegI scratch) %{
8785   predicate(UseAVX > 1 && n->as_Vector()->length() == 4);
8786   match(Set dst (RShiftVL src shift));
8787   effect(TEMP dst, TEMP tmp, TEMP scratch);
8788   format %{ "vpsrlq   $dst,$src,$shift\n\t"
8789             "vmovdqu  $tmp,[0x8000000000000000]\n\t"
8790             "vpsrlq   $tmp,$tmp,$shift\n\t"
8791             "vpxor    $dst,$dst,$tmp\n\t"
8792             "vpsubq   $dst,$dst,$tmp\t! arithmetic right shift packed4L" %}
8793   ins_encode %{
8794     int vector_len = 1;
8795     __ vpsrlq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8796     __ vmovdqu($tmp$$XMMRegister, ExternalAddress(vector_long_sign_mask()), $scratch$$Register);
8797     __ vpsrlq($tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$XMMRegister, vector_len);
8798     __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8799     __ vpsubq($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vector_len);
8800   %}
8801   ins_pipe( pipe_slow );
8802 %}
8803 
8804 instruct vsra4L_reg_evex(vecY dst, vecY src, vecS shift) %{
8805   predicate(UseAVX > 2 && n->as_Vector()->length() == 4);
8806   match(Set dst (RShiftVL src shift));
8807   format %{ "evpsraq  $dst,$src,$shift\t! arithmetic right shift packed4L" %}
8808   ins_encode %{
8809     int vector_len = 1;
8810     __ evpsraq($dst$$XMMRegister, $src$$XMMRegister, $shift$$XMMRegister, vector_len);
8811   %}
8812   ins_pipe( pipe_slow );
8813 %}
8814 
8815 // --------------------------------- AND --------------------------------------
8816 
8817 instruct vand4B(vecS dst, vecS src) %{
8818   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
8819   match(Set dst (AndV dst src));
8820   format %{ "pand    $dst,$src\t! and vectors (4 bytes)" %}
8821   ins_encode %{
8822     __ pand($dst$$XMMRegister, $src$$XMMRegister);
8823   %}
8824   ins_pipe( pipe_slow );
8825 %}
8826 
8827 instruct vand4B_reg(vecS dst, vecS src1, vecS src2) %{
8828   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8829   match(Set dst (AndV src1 src2));
8830   format %{ "vpand   $dst,$src1,$src2\t! and vectors (4 bytes)" %}
8831   ins_encode %{
8832     int vector_len = 0;
8833     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8834   %}
8835   ins_pipe( pipe_slow );
8836 %}
8837 
8838 instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
8839   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8840   match(Set dst (AndV src (LoadVector mem)));
8841   format %{ "vpand   $dst,$src,$mem\t! and vectors (4 bytes)" %}
8842   ins_encode %{
8843     int vector_len = 0;
8844     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8845   %}
8846   ins_pipe( pipe_slow );
8847 %}
8848 
8849 instruct vand8B(vecD dst, vecD src) %{
8850   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
8851   match(Set dst (AndV dst src));
8852   format %{ "pand    $dst,$src\t! and vectors (8 bytes)" %}
8853   ins_encode %{
8854     __ pand($dst$$XMMRegister, $src$$XMMRegister);
8855   %}
8856   ins_pipe( pipe_slow );
8857 %}
8858 
8859 instruct vand8B_reg(vecD dst, vecD src1, vecD src2) %{
8860   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8861   match(Set dst (AndV src1 src2));
8862   format %{ "vpand   $dst,$src1,$src2\t! and vectors (8 bytes)" %}
8863   ins_encode %{
8864     int vector_len = 0;
8865     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8866   %}
8867   ins_pipe( pipe_slow );
8868 %}
8869 
8870 instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
8871   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
8872   match(Set dst (AndV src (LoadVector mem)));
8873   format %{ "vpand   $dst,$src,$mem\t! and vectors (8 bytes)" %}
8874   ins_encode %{
8875     int vector_len = 0;
8876     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8877   %}
8878   ins_pipe( pipe_slow );
8879 %}
8880 
8881 instruct vand16B(vecX dst, vecX src) %{
8882   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
8883   match(Set dst (AndV dst src));
8884   format %{ "pand    $dst,$src\t! and vectors (16 bytes)" %}
8885   ins_encode %{
8886     __ pand($dst$$XMMRegister, $src$$XMMRegister);
8887   %}
8888   ins_pipe( pipe_slow );
8889 %}
8890 
8891 instruct vand16B_reg(vecX dst, vecX src1, vecX src2) %{
8892   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
8893   match(Set dst (AndV src1 src2));
8894   format %{ "vpand   $dst,$src1,$src2\t! and vectors (16 bytes)" %}
8895   ins_encode %{
8896     int vector_len = 0;
8897     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8898   %}
8899   ins_pipe( pipe_slow );
8900 %}
8901 
8902 instruct vand16B_mem(vecX dst, vecX src, memory mem) %{
8903   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
8904   match(Set dst (AndV src (LoadVector mem)));
8905   format %{ "vpand   $dst,$src,$mem\t! and vectors (16 bytes)" %}
8906   ins_encode %{
8907     int vector_len = 0;
8908     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8909   %}
8910   ins_pipe( pipe_slow );
8911 %}
8912 
8913 instruct vand32B_reg(vecY dst, vecY src1, vecY src2) %{
8914   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
8915   match(Set dst (AndV src1 src2));
8916   format %{ "vpand   $dst,$src1,$src2\t! and vectors (32 bytes)" %}
8917   ins_encode %{
8918     int vector_len = 1;
8919     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8920   %}
8921   ins_pipe( pipe_slow );
8922 %}
8923 
8924 instruct vand32B_mem(vecY dst, vecY src, memory mem) %{
8925   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
8926   match(Set dst (AndV src (LoadVector mem)));
8927   format %{ "vpand   $dst,$src,$mem\t! and vectors (32 bytes)" %}
8928   ins_encode %{
8929     int vector_len = 1;
8930     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8931   %}
8932   ins_pipe( pipe_slow );
8933 %}
8934 
8935 instruct vand64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
8936   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
8937   match(Set dst (AndV src1 src2));
8938   format %{ "vpand   $dst,$src1,$src2\t! and vectors (64 bytes)" %}
8939   ins_encode %{
8940     int vector_len = 2;
8941     __ vpand($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8942   %}
8943   ins_pipe( pipe_slow );
8944 %}
8945 
8946 instruct vand64B_mem(vecZ dst, vecZ src, memory mem) %{
8947   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
8948   match(Set dst (AndV src (LoadVector mem)));
8949   format %{ "vpand   $dst,$src,$mem\t! and vectors (64 bytes)" %}
8950   ins_encode %{
8951     int vector_len = 2;
8952     __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8953   %}
8954   ins_pipe( pipe_slow );
8955 %}
8956 
8957 // --------------------------------- OR ---------------------------------------
8958 
8959 instruct vor4B(vecS dst, vecS src) %{
8960   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
8961   match(Set dst (OrV dst src));
8962   format %{ "por     $dst,$src\t! or vectors (4 bytes)" %}
8963   ins_encode %{
8964     __ por($dst$$XMMRegister, $src$$XMMRegister);
8965   %}
8966   ins_pipe( pipe_slow );
8967 %}
8968 
8969 instruct vor4B_reg(vecS dst, vecS src1, vecS src2) %{
8970   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8971   match(Set dst (OrV src1 src2));
8972   format %{ "vpor    $dst,$src1,$src2\t! or vectors (4 bytes)" %}
8973   ins_encode %{
8974     int vector_len = 0;
8975     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
8976   %}
8977   ins_pipe( pipe_slow );
8978 %}
8979 
8980 instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
8981   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
8982   match(Set dst (OrV src (LoadVector mem)));
8983   format %{ "vpor    $dst,$src,$mem\t! or vectors (4 bytes)" %}
8984   ins_encode %{
8985     int vector_len = 0;
8986     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
8987   %}
8988   ins_pipe( pipe_slow );
8989 %}
8990 
8991 instruct vor8B(vecD dst, vecD src) %{
8992   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
8993   match(Set dst (OrV dst src));
8994   format %{ "por     $dst,$src\t! or vectors (8 bytes)" %}
8995   ins_encode %{
8996     __ por($dst$$XMMRegister, $src$$XMMRegister);
8997   %}
8998   ins_pipe( pipe_slow );
8999 %}
9000 
9001 instruct vor8B_reg(vecD dst, vecD src1, vecD src2) %{
9002   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9003   match(Set dst (OrV src1 src2));
9004   format %{ "vpor    $dst,$src1,$src2\t! or vectors (8 bytes)" %}
9005   ins_encode %{
9006     int vector_len = 0;
9007     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9008   %}
9009   ins_pipe( pipe_slow );
9010 %}
9011 
9012 instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
9013   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9014   match(Set dst (OrV src (LoadVector mem)));
9015   format %{ "vpor    $dst,$src,$mem\t! or vectors (8 bytes)" %}
9016   ins_encode %{
9017     int vector_len = 0;
9018     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9019   %}
9020   ins_pipe( pipe_slow );
9021 %}
9022 
9023 instruct vor16B(vecX dst, vecX src) %{
9024   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9025   match(Set dst (OrV dst src));
9026   format %{ "por     $dst,$src\t! or vectors (16 bytes)" %}
9027   ins_encode %{
9028     __ por($dst$$XMMRegister, $src$$XMMRegister);
9029   %}
9030   ins_pipe( pipe_slow );
9031 %}
9032 
9033 instruct vor16B_reg(vecX dst, vecX src1, vecX src2) %{
9034   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9035   match(Set dst (OrV src1 src2));
9036   format %{ "vpor    $dst,$src1,$src2\t! or vectors (16 bytes)" %}
9037   ins_encode %{
9038     int vector_len = 0;
9039     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9040   %}
9041   ins_pipe( pipe_slow );
9042 %}
9043 
9044 instruct vor16B_mem(vecX dst, vecX src, memory mem) %{
9045   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9046   match(Set dst (OrV src (LoadVector mem)));
9047   format %{ "vpor    $dst,$src,$mem\t! or vectors (16 bytes)" %}
9048   ins_encode %{
9049     int vector_len = 0;
9050     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9051   %}
9052   ins_pipe( pipe_slow );
9053 %}
9054 
9055 instruct vor32B_reg(vecY dst, vecY src1, vecY src2) %{
9056   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9057   match(Set dst (OrV src1 src2));
9058   format %{ "vpor    $dst,$src1,$src2\t! or vectors (32 bytes)" %}
9059   ins_encode %{
9060     int vector_len = 1;
9061     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9062   %}
9063   ins_pipe( pipe_slow );
9064 %}
9065 
9066 instruct vor32B_mem(vecY dst, vecY src, memory mem) %{
9067   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9068   match(Set dst (OrV src (LoadVector mem)));
9069   format %{ "vpor    $dst,$src,$mem\t! or vectors (32 bytes)" %}
9070   ins_encode %{
9071     int vector_len = 1;
9072     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9073   %}
9074   ins_pipe( pipe_slow );
9075 %}
9076 
9077 instruct vor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9078   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9079   match(Set dst (OrV src1 src2));
9080   format %{ "vpor    $dst,$src1,$src2\t! or vectors (64 bytes)" %}
9081   ins_encode %{
9082     int vector_len = 2;
9083     __ vpor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9084   %}
9085   ins_pipe( pipe_slow );
9086 %}
9087 
9088 instruct vor64B_mem(vecZ dst, vecZ src, memory mem) %{
9089   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9090   match(Set dst (OrV src (LoadVector mem)));
9091   format %{ "vpor    $dst,$src,$mem\t! or vectors (64 bytes)" %}
9092   ins_encode %{
9093     int vector_len = 2;
9094     __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9095   %}
9096   ins_pipe( pipe_slow );
9097 %}
9098 
9099 // --------------------------------- XOR --------------------------------------
9100 
9101 instruct vxor4B(vecS dst, vecS src) %{
9102   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 4);
9103   match(Set dst (XorV dst src));
9104   format %{ "pxor    $dst,$src\t! xor vectors (4 bytes)" %}
9105   ins_encode %{
9106     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9107   %}
9108   ins_pipe( pipe_slow );
9109 %}
9110 
9111 instruct vxor4B_reg(vecS dst, vecS src1, vecS src2) %{
9112   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9113   match(Set dst (XorV src1 src2));
9114   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (4 bytes)" %}
9115   ins_encode %{
9116     int vector_len = 0;
9117     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9118   %}
9119   ins_pipe( pipe_slow );
9120 %}
9121 
9122 instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
9123   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
9124   match(Set dst (XorV src (LoadVector mem)));
9125   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (4 bytes)" %}
9126   ins_encode %{
9127     int vector_len = 0;
9128     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9129   %}
9130   ins_pipe( pipe_slow );
9131 %}
9132 
9133 instruct vxor8B(vecD dst, vecD src) %{
9134   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 8);
9135   match(Set dst (XorV dst src));
9136   format %{ "pxor    $dst,$src\t! xor vectors (8 bytes)" %}
9137   ins_encode %{
9138     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9139   %}
9140   ins_pipe( pipe_slow );
9141 %}
9142 
9143 instruct vxor8B_reg(vecD dst, vecD src1, vecD src2) %{
9144   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9145   match(Set dst (XorV src1 src2));
9146   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (8 bytes)" %}
9147   ins_encode %{
9148     int vector_len = 0;
9149     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9150   %}
9151   ins_pipe( pipe_slow );
9152 %}
9153 
9154 instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
9155   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
9156   match(Set dst (XorV src (LoadVector mem)));
9157   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (8 bytes)" %}
9158   ins_encode %{
9159     int vector_len = 0;
9160     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9161   %}
9162   ins_pipe( pipe_slow );
9163 %}
9164 
9165 instruct vxor16B(vecX dst, vecX src) %{
9166   predicate(UseAVX == 0 && n->as_Vector()->length_in_bytes() == 16);
9167   match(Set dst (XorV dst src));
9168   format %{ "pxor    $dst,$src\t! xor vectors (16 bytes)" %}
9169   ins_encode %{
9170     __ pxor($dst$$XMMRegister, $src$$XMMRegister);
9171   %}
9172   ins_pipe( pipe_slow );
9173 %}
9174 
9175 instruct vxor16B_reg(vecX dst, vecX src1, vecX src2) %{
9176   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9177   match(Set dst (XorV src1 src2));
9178   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (16 bytes)" %}
9179   ins_encode %{
9180     int vector_len = 0;
9181     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9182   %}
9183   ins_pipe( pipe_slow );
9184 %}
9185 
9186 instruct vxor16B_mem(vecX dst, vecX src, memory mem) %{
9187   predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 16);
9188   match(Set dst (XorV src (LoadVector mem)));
9189   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (16 bytes)" %}
9190   ins_encode %{
9191     int vector_len = 0;
9192     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9193   %}
9194   ins_pipe( pipe_slow );
9195 %}
9196 
9197 instruct vxor32B_reg(vecY dst, vecY src1, vecY src2) %{
9198   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9199   match(Set dst (XorV src1 src2));
9200   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (32 bytes)" %}
9201   ins_encode %{
9202     int vector_len = 1;
9203     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9204   %}
9205   ins_pipe( pipe_slow );
9206 %}
9207 
9208 instruct vxor32B_mem(vecY dst, vecY src, memory mem) %{
9209   predicate(UseAVX > 1 && n->as_Vector()->length_in_bytes() == 32);
9210   match(Set dst (XorV src (LoadVector mem)));
9211   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (32 bytes)" %}
9212   ins_encode %{
9213     int vector_len = 1;
9214     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9215   %}
9216   ins_pipe( pipe_slow );
9217 %}
9218 
9219 instruct vxor64B_reg(vecZ dst, vecZ src1, vecZ src2) %{
9220   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9221   match(Set dst (XorV src1 src2));
9222   format %{ "vpxor   $dst,$src1,$src2\t! xor vectors (64 bytes)" %}
9223   ins_encode %{
9224     int vector_len = 2;
9225     __ vpxor($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9226   %}
9227   ins_pipe( pipe_slow );
9228 %}
9229 
9230 instruct vxor64B_mem(vecZ dst, vecZ src, memory mem) %{
9231   predicate(UseAVX > 2 && n->as_Vector()->length_in_bytes() == 64);
9232   match(Set dst (XorV src (LoadVector mem)));
9233   format %{ "vpxor   $dst,$src,$mem\t! xor vectors (64 bytes)" %}
9234   ins_encode %{
9235     int vector_len = 2;
9236     __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
9237   %}
9238   ins_pipe( pipe_slow );
9239 %}
9240 
9241 // --------------------------------- ABS --------------------------------------
9242 // a = |a|
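// pabsb/pabsw/pabsd (SSSE3, hence UseSSE > 2) and their VEX-encoded vpabs*
// forms compute the element-wise absolute value of packed signed integers,
// e.g. for packed4I: dst[i] = |src[i]|, i = 0..3.  The 64-bit element form,
// evpabsq, is EVEX-only and therefore gated on UseAVX > 2.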
9243 instruct vabs4B_reg(vecS dst, vecS src) %{
9244   predicate(UseSSE > 2 && n->as_Vector()->length() == 4);
9245   match(Set dst (AbsVB  src));
9246   format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed4B" %}
9247   ins_encode %{
9248     __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
9249   %}
9250   ins_pipe( pipe_slow );
9251 %}
9252 
9253 instruct vabs8B_reg(vecD dst, vecD src) %{
9254   predicate(UseSSE > 2 && n->as_Vector()->length() == 8);
9255   match(Set dst (AbsVB  src));
9256   format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed8B" %}
9257   ins_encode %{
9258     __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
9259   %}
9260   ins_pipe( pipe_slow );
9261 %}
9262 
9263 instruct vabs16B_reg(vecX dst, vecX src) %{
9264   predicate(UseSSE > 2 && n->as_Vector()->length() == 16);
9265   match(Set dst (AbsVB  src));
9266   format %{ "pabsb $dst,$src\t# $dst = |$src| abs packed16B" %}
9267   ins_encode %{
9268     __ pabsb($dst$$XMMRegister, $src$$XMMRegister);
9269   %}
9270   ins_pipe( pipe_slow );
9271 %}
9272 
9273 instruct vabs32B_reg(vecY dst, vecY src) %{
9274   predicate(UseAVX > 1 && n->as_Vector()->length() == 32);
9275   match(Set dst (AbsVB  src));
9276   format %{ "vpabsb $dst,$src\t# $dst = |$src| abs packed32B" %}
9277   ins_encode %{
9278     int vector_len = 1;
9279     __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9280   %}
9281   ins_pipe( pipe_slow );
9282 %}
9283 
9284 instruct vabs64B_reg(vecZ dst, vecZ src) %{
9285   predicate(UseAVX > 2 && n->as_Vector()->length() == 64);
9286   match(Set dst (AbsVB  src));
9287   format %{ "vpabsb $dst,$src\t# $dst = |$src| abs packed64B" %}
9288   ins_encode %{
9289     int vector_len = 2;
9290     __ vpabsb($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9291   %}
9292   ins_pipe( pipe_slow );
9293 %}
9294 
9295 instruct vabs2S_reg(vecD dst, vecD src) %{
9296   predicate(UseSSE > 2 && n->as_Vector()->length() == 2);
9297   match(Set dst (AbsVS  src));
9298   format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed2S" %}
9299   ins_encode %{
9300     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
9301   %}
9302   ins_pipe( pipe_slow );
9303 %}
9304 
9305 instruct vabs4S_reg(vecD dst, vecD src) %{
9306   predicate(UseSSE > 2 && n->as_Vector()->length() == 4);
9307   match(Set dst (AbsVS  src));
9308   format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed4S" %}
9309   ins_encode %{
9310     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
9311   %}
9312   ins_pipe( pipe_slow );
9313 %}
9314 
9315 instruct vabs8S_reg(vecX dst, vecX src) %{
9316   predicate(UseSSE > 2 && n->as_Vector()->length() == 8);
9317   match(Set dst (AbsVS  src));
9318   format %{ "pabsw $dst,$src\t# $dst = |$src| abs packed8S" %}
9319   ins_encode %{
9320     __ pabsw($dst$$XMMRegister, $src$$XMMRegister);
9321   %}
9322   ins_pipe( pipe_slow );
9323 %}
9324 
9325 instruct vabs16S_reg(vecY dst, vecY src) %{
9326   predicate(UseAVX > 1 && n->as_Vector()->length() == 16);
9327   match(Set dst (AbsVS  src));
9328   format %{ "vpabsw $dst,$src\t# $dst = |$src| abs packed16S" %}
9329   ins_encode %{
9330     int vector_len = 1;
9331     __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9332   %}
9333   ins_pipe( pipe_slow );
9334 %}
9335 
9336 instruct vabs32S_reg(vecZ dst, vecZ src) %{
9337   predicate(UseAVX > 2 && n->as_Vector()->length() == 32);
9338   match(Set dst (AbsVS  src));
9339   format %{ "vpabsw $dst,$src\t# $dst = |$src| abs packed32S" %}
9340   ins_encode %{
9341     int vector_len = 2;
9342     __ vpabsw($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9343   %}
9344   ins_pipe( pipe_slow );
9345 %}
9346 
9347 instruct vabs2I_reg(vecD dst, vecD src) %{
9348   predicate(UseSSE > 2 && n->as_Vector()->length() == 2);
9349   match(Set dst (AbsVI  src));
9350   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed2I" %}
9351   ins_encode %{
9352     __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
9353   %}
9354   ins_pipe( pipe_slow );
9355 %}
9356 
9357 instruct vabs4I_reg(vecX dst, vecX src) %{
9358   predicate(UseSSE > 2 && n->as_Vector()->length() == 4);
9359   match(Set dst (AbsVI  src));
9360   format %{ "pabsd $dst,$src\t# $dst = |$src| abs packed4I" %}
9361   ins_encode %{
9362     __ pabsd($dst$$XMMRegister, $src$$XMMRegister);
9363   %}
9364   ins_pipe( pipe_slow );
9365 %}
9366 
9367 instruct vabs8I_reg(vecY dst, vecY src) %{
  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9369   match(Set dst (AbsVI src));
9370   format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed8I" %}
9371   ins_encode %{
9372     int vector_len = 1;
9373     __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9374   %}
9375   ins_pipe( pipe_slow );
9376 %}
9377 
9378 instruct vabs16I_reg(vecZ dst, vecZ src) %{
9379   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9380   match(Set dst (AbsVI src));
9381   format %{ "vpabsd $dst,$src\t# $dst = |$src| abs packed16I" %}
9382   ins_encode %{
9383     int vector_len = 2;
9384     __ vpabsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9385   %}
9386   ins_pipe( pipe_slow );
9387 %}
9388 
9389 instruct vabs2L_reg(vecX dst, vecX src) %{
9390   predicate(UseAVX > 2 && n->as_Vector()->length() == 2);
9391   match(Set dst (AbsVL  src));
9392   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed2L" %}
9393   ins_encode %{
9394     int vector_len = 0;
9395     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9396   %}
9397   ins_pipe( pipe_slow );
9398 %}
9399 
9400 instruct vabs4L_reg(vecY dst, vecY src) %{
9401   predicate(UseAVX > 2 && n->as_Vector()->length() == 4);
9402   match(Set dst (AbsVL  src));
9403   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed4L" %}
9404   ins_encode %{
9405     int vector_len = 1;
9406     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9407   %}
9408   ins_pipe( pipe_slow );
9409 %}
9410 
9411 instruct vabs8L_reg(vecZ dst, vecZ src) %{
9412   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9413   match(Set dst (AbsVL  src));
9414   format %{ "evpabsq $dst,$src\t# $dst = |$src| abs packed8L" %}
9415   ins_encode %{
9416     int vector_len = 2;
9417     __ evpabsq($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9418   %}
9419   ins_pipe( pipe_slow );
9420 %}
9421 
9422 // --------------------------------- ABSNEG --------------------------------------
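// Floating-point abs and neg share one pattern per width: the MacroAssembler
// vabsnegd/vabsnegf helpers inspect the ideal opcode and either clear the
// sign bit (AbsVF/AbsVD, AND with a sign mask) or flip it (NegVF/NegVD, XOR
// with a sign-flip mask).  The mask is loaded from a constant table, which is
// why a scratch register is reserved via effect(TEMP scratch).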
9423 
9424 instruct vabsneg2D(vecX dst, vecX src, rRegI scratch) %{
9425   predicate(UseSSE >= 2 && n->as_Vector()->length() == 2);
9426   match(Set dst (AbsVD  src));
9427   match(Set dst (NegVD  src));
9428   effect(TEMP scratch);
9429   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packed2D" %}
9430   ins_encode %{
9431     int opcode = this->as_Mach()->ideal_Opcode();
9432     if ($dst$$XMMRegister != $src$$XMMRegister)
9433       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
9434     __ vabsnegd(opcode, $dst$$XMMRegister, $scratch$$Register);
9435   %}
9436   ins_pipe( pipe_slow );
9437 %}
9438 
9439 instruct vabsneg4D(vecY dst, vecY src, rRegI scratch) %{
9440   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9441   match(Set dst (AbsVD  src));
9442   match(Set dst (NegVD  src));
9443   effect(TEMP scratch);
9444   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packed4D" %}
9445   ins_encode %{
9446     int opcode = this->as_Mach()->ideal_Opcode();
9447     int vector_len = 1;
9448     __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register);
9449   %}
9450   ins_pipe( pipe_slow );
9451 %}
9452 
9453 instruct vabsneg8D(vecZ dst, vecZ src, rRegI scratch) %{
9454   predicate(UseAVX > 2 && n->as_Vector()->length() == 8);
9455   match(Set dst (AbsVD  src));
9456   match(Set dst (NegVD  src));
9457   effect(TEMP scratch);
9458   format %{ "vabsnegd $dst,$src,[mask]\t# absneg packed8D" %}
9459   ins_encode %{
9460     int opcode = this->as_Mach()->ideal_Opcode();
9461     int vector_len = 2;
9462     __ vabsnegd(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register);
9463   %}
9464   ins_pipe( pipe_slow );
9465 %}
9466 
9467 instruct vabsneg2F(vecD dst, vecD src, rRegI scratch) %{
9468   predicate(UseSSE > 0 && n->as_Vector()->length() == 2);
9469   match(Set dst (AbsVF  src));
9470   match(Set dst (NegVF  src));
9471   effect(TEMP scratch);
9472   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packed2F" %}
9473   ins_encode %{
9474     int opcode = this->as_Mach()->ideal_Opcode();
9475     if ($dst$$XMMRegister != $src$$XMMRegister)
9476       __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
9477     __ vabsnegf(opcode, $dst$$XMMRegister, $scratch$$Register);
9478   %}
9479   ins_pipe( pipe_slow );
9480 %}
9481 
9482 instruct vabsneg4F(vecX dst, rRegI scratch) %{
9483   predicate(UseSSE > 0 && n->as_Vector()->length() == 4);
9484   match(Set dst (AbsVF  dst));
9485   match(Set dst (NegVF  dst));
9486   effect(TEMP scratch);
9487   format %{ "vabsnegf $dst,[mask]\t# absneg packed4F" %}
9488   ins_cost(150);
9489   ins_encode %{
9490     int opcode = this->as_Mach()->ideal_Opcode();
9491     __ vabsnegf(opcode, $dst$$XMMRegister, $scratch$$Register);
9492   %}
9493   ins_pipe( pipe_slow );
9494 %}
9495 
9496 instruct vabsneg8F(vecY dst, vecY src, rRegI scratch) %{
9497   predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
9498   match(Set dst (AbsVF  src));
9499   match(Set dst (NegVF  src));
9500   effect(TEMP scratch);
9501   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packed8F" %}
9502   ins_cost(150);
9503   ins_encode %{
9504     int opcode = this->as_Mach()->ideal_Opcode();
9505     int vector_len = 1;
9506     __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register);
9507   %}
9508   ins_pipe( pipe_slow );
9509 %}
9510 
9511 instruct vabsneg16F(vecZ dst, vecZ src, rRegI scratch) %{
9512   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9513   match(Set dst (AbsVF  src));
9514   match(Set dst (NegVF  src));
9515   effect(TEMP scratch);
9516   format %{ "vabsnegf $dst,$src,[mask]\t# absneg packed16F" %}
9517   ins_cost(150);
9518   ins_encode %{
9519     int opcode = this->as_Mach()->ideal_Opcode();
9520     int vector_len = 2;
9521     __ vabsnegf(opcode, $dst$$XMMRegister, $src$$XMMRegister, vector_len, $scratch$$Register);
9522   %}
9523   ins_pipe( pipe_slow );
9524 %}
9525 
9526 // --------------------------------- FMA --------------------------------------
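// Fused multiply-add: c = a * b + c, computed with a single rounding.  The
// vfmad/vfmaf MacroAssembler helpers emit the three-source FMA3 forms
// (vfmadd231pd/vfmadd231ps), so $c is both the accumulator input and the
// destination; availability is controlled by the UseFMA flag.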
9527 
9528 // a * b + c
9529 instruct vfma2D_reg(vecX a, vecX b, vecX c) %{
9530   predicate(UseFMA && n->as_Vector()->length() == 2);
9531   match(Set c (FmaVD  c (Binary a b)));
9532   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9533   ins_cost(150);
9534   ins_encode %{
9535     int vector_len = 0;
9536     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9537   %}
9538   ins_pipe( pipe_slow );
9539 %}
9540 
9541 // a * b + c
9542 instruct vfma2D_mem(vecX a, memory b, vecX c) %{
9543   predicate(UseFMA && n->as_Vector()->length() == 2);
9544   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9545   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed2D" %}
9546   ins_cost(150);
9547   ins_encode %{
9548     int vector_len = 0;
9549     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9550   %}
9551   ins_pipe( pipe_slow );
9552 %}
9553 
9554 
9555 // a * b + c
9556 instruct vfma4D_reg(vecY a, vecY b, vecY c) %{
9557   predicate(UseFMA && n->as_Vector()->length() == 4);
9558   match(Set c (FmaVD  c (Binary a b)));
9559   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9560   ins_cost(150);
9561   ins_encode %{
9562     int vector_len = 1;
9563     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9564   %}
9565   ins_pipe( pipe_slow );
9566 %}
9567 
9568 // a * b + c
9569 instruct vfma4D_mem(vecY a, memory b, vecY c) %{
9570   predicate(UseFMA && n->as_Vector()->length() == 4);
9571   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9572   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed4D" %}
9573   ins_cost(150);
9574   ins_encode %{
9575     int vector_len = 1;
9576     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9577   %}
9578   ins_pipe( pipe_slow );
9579 %}
9580 
9581 // a * b + c
9582 instruct vfma8D_reg(vecZ a, vecZ b, vecZ c) %{
9583   predicate(UseFMA && n->as_Vector()->length() == 8);
9584   match(Set c (FmaVD  c (Binary a b)));
9585   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9586   ins_cost(150);
9587   ins_encode %{
9588     int vector_len = 2;
9589     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9590   %}
9591   ins_pipe( pipe_slow );
9592 %}
9593 
9594 // a * b + c
9595 instruct vfma8D_mem(vecZ a, memory b, vecZ c) %{
9596   predicate(UseFMA && n->as_Vector()->length() == 8);
9597   match(Set c (FmaVD  c (Binary a (LoadVector b))));
9598   format %{ "fmapd $a,$b,$c\t# $c = $a * $b + $c fma packed8D" %}
9599   ins_cost(150);
9600   ins_encode %{
9601     int vector_len = 2;
9602     __ vfmad($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9603   %}
9604   ins_pipe( pipe_slow );
9605 %}
9606 
9607 // a * b + c
9608 instruct vfma4F_reg(vecX a, vecX b, vecX c) %{
9609   predicate(UseFMA && n->as_Vector()->length() == 4);
9610   match(Set c (FmaVF  c (Binary a b)));
9611   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9612   ins_cost(150);
9613   ins_encode %{
9614     int vector_len = 0;
9615     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9616   %}
9617   ins_pipe( pipe_slow );
9618 %}
9619 
9620 // a * b + c
9621 instruct vfma4F_mem(vecX a, memory b, vecX c) %{
9622   predicate(UseFMA && n->as_Vector()->length() == 4);
9623   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9624   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed4F" %}
9625   ins_cost(150);
9626   ins_encode %{
9627     int vector_len = 0;
9628     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9629   %}
9630   ins_pipe( pipe_slow );
9631 %}
9632 
9633 // a * b + c
9634 instruct vfma8F_reg(vecY a, vecY b, vecY c) %{
9635   predicate(UseFMA && n->as_Vector()->length() == 8);
9636   match(Set c (FmaVF  c (Binary a b)));
9637   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9638   ins_cost(150);
9639   ins_encode %{
9640     int vector_len = 1;
9641     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9642   %}
9643   ins_pipe( pipe_slow );
9644 %}
9645 
9646 // a * b + c
9647 instruct vfma8F_mem(vecY a, memory b, vecY c) %{
9648   predicate(UseFMA && n->as_Vector()->length() == 8);
9649   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9650   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed8F" %}
9651   ins_cost(150);
9652   ins_encode %{
9653     int vector_len = 1;
9654     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9655   %}
9656   ins_pipe( pipe_slow );
9657 %}
9658 
9659 // a * b + c
9660 instruct vfma16F_reg(vecZ a, vecZ b, vecZ c) %{
9661   predicate(UseFMA && n->as_Vector()->length() == 16);
9662   match(Set c (FmaVF  c (Binary a b)));
9663   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9664   ins_cost(150);
9665   ins_encode %{
9666     int vector_len = 2;
9667     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister, vector_len);
9668   %}
9669   ins_pipe( pipe_slow );
9670 %}
9671 
9672 // a * b + c
9673 instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
9674   predicate(UseFMA && n->as_Vector()->length() == 16);
9675   match(Set c (FmaVF  c (Binary a (LoadVector b))));
9676   format %{ "fmaps $a,$b,$c\t# $c = $a * $b + $c fma packed16F" %}
9677   ins_cost(150);
9678   ins_encode %{
9679     int vector_len = 2;
9680     __ vfmaf($c$$XMMRegister, $a$$XMMRegister, $b$$Address, $c$$XMMRegister, vector_len);
9681   %}
9682   ins_pipe( pipe_slow );
9683 %}
9684 
9685 // --------------------------------- Vector Multiply Add --------------------------------------
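// pmaddwd/vpmaddwd multiply corresponding signed 16-bit elements and add each
// adjacent pair of 32-bit products, halving the element count:
//   dst[i] = src1[2*i] * src2[2*i] + src1[2*i+1] * src2[2*i+1]
// which is why a packed4S input produces a packed2I result, and so on.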
9686 
9687 instruct smuladd4S2I_reg(vecD dst, vecD src1) %{
9688   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 2);
9689   match(Set dst (MulAddVS2VI dst src1));
9690   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed4Sto2I" %}
9691   ins_encode %{
9692     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
9693   %}
9694   ins_pipe( pipe_slow );
9695 %}
9696 
9697 instruct vmuladd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
9698   predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
9699   match(Set dst (MulAddVS2VI src1 src2));
9700   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed4Sto2I" %}
9701   ins_encode %{
9702     int vector_len = 0;
9703     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9704   %}
9705   ins_pipe( pipe_slow );
9706 %}
9707 
9708 instruct smuladd8S4I_reg(vecX dst, vecX src1) %{
9709   predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4);
9710   match(Set dst (MulAddVS2VI dst src1));
9711   format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed8Sto4I" %}
9712   ins_encode %{
9713     __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
9714   %}
9715   ins_pipe( pipe_slow );
9716 %}
9717 
9718 instruct vmuladd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
9719   predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
9720   match(Set dst (MulAddVS2VI src1 src2));
9721   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed8Sto4I" %}
9722   ins_encode %{
9723     int vector_len = 0;
9724     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9725   %}
9726   ins_pipe( pipe_slow );
9727 %}
9728 
9729 instruct vmuladd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
9730   predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
9731   match(Set dst (MulAddVS2VI src1 src2));
9732   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed16Sto8I" %}
9733   ins_encode %{
9734     int vector_len = 1;
9735     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9736   %}
9737   ins_pipe( pipe_slow );
9738 %}
9739 
9740 instruct vmuladd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
9741   predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
9742   match(Set dst (MulAddVS2VI src1 src2));
9743   format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed32Sto16I" %}
9744   ins_encode %{
9745     int vector_len = 2;
9746     __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9747   %}
9748   ins_pipe( pipe_slow );
9749 %}
9750 
9751 // --------------------------------- Vector Multiply Add Add ----------------------------------
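// With AVX-512 VNNI, evpdpwssd fuses the 16-bit multiply-add above with the
// following 32-bit accumulation into a single instruction:
//   dst[i] += src1[2*i] * src2[2*i] + src1[2*i+1] * src2[2*i+1]
// The low ins_cost(10) below biases the matcher toward this fused form over
// the separate vpmaddwd + vpaddd sequence.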
9752 
9753 instruct vmuladdadd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
9754   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 2);
9755   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9756   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed4Sto2I" %}
9757   ins_encode %{
9758     int vector_len = 0;
9759     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9760   %}
9761   ins_pipe( pipe_slow );
9762   ins_cost(10);
9763 %}
9764 
9765 instruct vmuladdadd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
9766   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 4);
9767   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9768   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed8Sto4I" %}
9769   ins_encode %{
9770     int vector_len = 0;
9771     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9772   %}
9773   ins_pipe( pipe_slow );
9774   ins_cost(10);
9775 %}
9776 
9777 instruct vmuladdadd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
9778   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 8);
9779   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9780   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed16Sto8I" %}
9781   ins_encode %{
9782     int vector_len = 1;
9783     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9784   %}
9785   ins_pipe( pipe_slow );
9786   ins_cost(10);
9787 %}
9788 
9789 instruct vmuladdadd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
9790   predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 16);
9791   match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
9792   format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed32Sto16I" %}
9793   ins_encode %{
9794     int vector_len = 2;
9795     __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
9796   %}
9797   ins_pipe( pipe_slow );
9798   ins_cost(10);
9799 %}
9800 
9801 // --------------------------------- PopCount --------------------------------------
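// vpopcntd (AVX-512 VPOPCNTDQ) counts the set bits in each 32-bit lane:
// dst[i] = popcount(src[i]).  Both the CPU feature (supports_vpopcntdq) and
// the UsePopCountInstruction flag must be on for these patterns to match.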
9802 
9803 instruct vpopcount2I(vecD dst, vecD src) %{
9804   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
9805   match(Set dst (PopCountVI src));
9806   format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
9807   ins_encode %{
9808     int vector_len = 0;
9809     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9810   %}
9811   ins_pipe( pipe_slow );
9812 %}
9813 
9814 instruct vpopcount4I(vecX dst, vecX src) %{
9815   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
9816   match(Set dst (PopCountVI src));
9817   format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
9818   ins_encode %{
9819     int vector_len = 0;
9820     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9821   %}
9822   ins_pipe( pipe_slow );
9823 %}
9824 
9825 instruct vpopcount8I(vecY dst, vecY src) %{
9826   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
9827   match(Set dst (PopCountVI src));
9828   format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
9829   ins_encode %{
9830     int vector_len = 1;
9831     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9832   %}
9833   ins_pipe( pipe_slow );
9834 %}
9835 
9836 instruct vpopcount16I(vecZ dst, vecZ src) %{
9837   predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
9838   match(Set dst (PopCountVI src));
9839   format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
9840   ins_encode %{
9841     int vector_len = 2;
9842     __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
9843   %}
9844   ins_pipe( pipe_slow );
9845 %}