1 /* 2 * Copyright (c) 1994, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 
24 */ 25 26 package java.lang; 27 28 import java.io.ObjectStreamField; 29 import java.io.UnsupportedEncodingException; 30 import java.lang.annotation.Native; 31 import java.lang.invoke.MethodHandles; 32 import java.lang.constant.Constable; 33 import java.lang.constant.ConstantDesc; 34 import java.nio.ByteBuffer; 35 import java.nio.CharBuffer; 36 import java.nio.charset.*; 37 import java.util.ArrayList; 38 import java.util.Arrays; 39 import java.util.Comparator; 40 import java.util.Formatter; 41 import java.util.List; 42 import java.util.Locale; 43 import java.util.Objects; 44 import java.util.Optional; 45 import java.util.Spliterator; 46 import java.util.function.Function; 47 import java.util.regex.Pattern; 48 import java.util.regex.PatternSyntaxException; 49 import java.util.stream.Collectors; 50 import java.util.stream.IntStream; 51 import java.util.stream.Stream; 52 import java.util.stream.StreamSupport; 53 54 import jdk.internal.util.ArraysSupport; 55 import jdk.internal.util.Preconditions; 56 import jdk.internal.vm.annotation.ForceInline; 57 import jdk.internal.vm.annotation.IntrinsicCandidate; 58 import jdk.internal.vm.annotation.Stable; 59 import sun.nio.cs.ArrayDecoder; 60 import sun.nio.cs.ArrayEncoder; 61 62 import sun.nio.cs.ISO_8859_1; 63 import sun.nio.cs.US_ASCII; 64 import sun.nio.cs.UTF_8; 65 66 /** 67 * The {@code String} class represents character strings. All 68 * string literals in Java programs, such as {@code "abc"}, are 69 * implemented as instances of this class. 70 * <p> 71 * Strings are constant; their values cannot be changed after they 72 * are created. String buffers support mutable strings. 73 * Because String objects are immutable they can be shared. 
For example: 74 * <blockquote><pre> 75 * String str = "abc"; 76 * </pre></blockquote><p> 77 * is equivalent to: 78 * <blockquote><pre> 79 * char data[] = {'a', 'b', 'c'}; 80 * String str = new String(data); 81 * </pre></blockquote><p> 82 * Here are some more examples of how strings can be used: 83 * <blockquote><pre> 84 * System.out.println("abc"); 85 * String cde = "cde"; 86 * System.out.println("abc" + cde); 87 * String c = "abc".substring(2, 3); 88 * String d = cde.substring(1, 2); 89 * </pre></blockquote> 90 * <p> 91 * The class {@code String} includes methods for examining 92 * individual characters of the sequence, for comparing strings, for 93 * searching strings, for extracting substrings, and for creating a 94 * copy of a string with all characters translated to uppercase or to 95 * lowercase. Case mapping is based on the Unicode Standard version 96 * specified by the {@link java.lang.Character Character} class. 97 * <p> 98 * The Java language provides special support for the string 99 * concatenation operator ( + ), and for conversion of 100 * other objects to strings. For additional information on string 101 * concatenation and conversion, see <i>The Java Language Specification</i>. 102 * 103 * <p> Unless otherwise noted, passing a {@code null} argument to a constructor 104 * or method in this class will cause a {@link NullPointerException} to be 105 * thrown. 106 * 107 * <p>A {@code String} represents a string in the UTF-16 format 108 * in which <em>supplementary characters</em> are represented by <em>surrogate 109 * pairs</em> (see the section <a href="Character.html#unicode">Unicode 110 * Character Representations</a> in the {@code Character} class for 111 * more information). 112 * Index values refer to {@code char} code units, so a supplementary 113 * character uses two positions in a {@code String}. 
114 * <p>The {@code String} class provides methods for dealing with 115 * Unicode code points (i.e., characters), in addition to those for 116 * dealing with Unicode code units (i.e., {@code char} values). 117 * 118 * <p>Unless otherwise noted, methods for comparing Strings do not take locale 119 * into account. The {@link java.text.Collator} class provides methods for 120 * finer-grain, locale-sensitive String comparison. 121 * 122 * @implNote The implementation of the string concatenation operator is left to 123 * the discretion of a Java compiler, as long as the compiler ultimately conforms 124 * to <i>The Java Language Specification</i>. For example, the {@code javac} compiler 125 * may implement the operator with {@code StringBuffer}, {@code StringBuilder}, 126 * or {@code java.lang.invoke.StringConcatFactory} depending on the JDK version. The 127 * implementation of string conversion is typically through the method {@code toString}, 128 * defined by {@code Object} and inherited by all classes in Java. 129 * 130 * @author Lee Boynton 131 * @author Arthur van Hoff 132 * @author Martin Buchholz 133 * @author Ulf Zibis 134 * @see java.lang.Object#toString() 135 * @see java.lang.StringBuffer 136 * @see java.lang.StringBuilder 137 * @see java.nio.charset.Charset 138 * @since 1.0 139 * @jls 15.18.1 String Concatenation Operator + 140 */ 141 142 public final class String 143 implements java.io.Serializable, Comparable<String>, CharSequence, 144 Constable, ConstantDesc { 145 146 /** 147 * The value is used for character storage. 148 * 149 * @implNote This field is trusted by the VM, and is a subject to 150 * constant folding if String instance is constant. Overwriting this 151 * field after construction will cause problems. 152 * 153 * Additionally, it is marked with {@link Stable} to trust the contents 154 * of the array. No other facility in JDK provides this functionality (yet). 155 * {@link Stable} is safe here, because value is never null. 
/**
 * The value is used for character storage.
 *
 * @implNote This field is trusted by the VM, and is subject to
 * constant folding if String instance is constant. Overwriting this
 * field after construction will cause problems.
 *
 * Additionally, it is marked with {@link Stable} to trust the contents
 * of the array. No other facility in JDK provides this functionality (yet).
 * {@link Stable} is safe here, because value is never null.
 */
@Stable
private final byte[] value;

/**
 * The identifier of the encoding used to encode the bytes in
 * {@code value}. The supported values in this implementation are
 *
 * LATIN1
 * UTF16
 *
 * @implNote This field is trusted by the VM, and is subject to
 * constant folding if String instance is constant. Overwriting this
 * field after construction will cause problems.
 */
private final byte coder;

/** Cache the hash code for the string */
private int hash; // Default to 0

/**
 * Cache if the hash has been calculated as actually being zero, enabling
 * us to avoid recalculating this.
 */
private boolean hashIsZero; // Default to false;

/** use serialVersionUID from JDK 1.0.2 for interoperability */
@java.io.Serial
private static final long serialVersionUID = -6849794470754667710L;

/**
 * If String compaction is disabled, the bytes in {@code value} are
 * always encoded in UTF16.
 *
 * For methods with several possible implementation paths, when String
 * compaction is disabled, only one code path is taken.
 *
 * The instance field value is generally opaque to optimizing JIT
 * compilers. Therefore, in performance-sensitive places, an explicit
 * check of the static boolean {@code COMPACT_STRINGS} is done first
 * before checking the {@code coder} field since the static boolean
 * {@code COMPACT_STRINGS} would be constant folded away by an
 * optimizing JIT compiler. The idioms for these cases are as follows.
 *
 * Code such as:
 *
 *    if (coder == LATIN1) { ... }
 *
 * can be written more optimally as
 *
 *    if (coder() == LATIN1) { ... }
 *
 * or:
 *
 *    if (COMPACT_STRINGS && coder == LATIN1) { ... }
 *
 * An optimizing JIT compiler can fold the above conditional as:
 *
 *    COMPACT_STRINGS == true  => if (coder == LATIN1) { ... }
 *    COMPACT_STRINGS == false => if (false)           { ... }
 *
 * @implNote
 * The actual value for this field is injected by JVM. The static
 * initialization block is used to set the value here to communicate
 * that this static final field is not statically foldable, and to
 * avoid any possible circular dependency during vm initialization.
 */
static final boolean COMPACT_STRINGS;

// Assigned in an initializer block (not inline) on purpose; see the
// @implNote on COMPACT_STRINGS.
static {
    COMPACT_STRINGS = true;
}

/**
 * Class String is special cased within the Serialization Stream Protocol.
 *
 * A String instance is written into an ObjectOutputStream according to
 * <a href="{@docRoot}/../specs/serialization/protocol.html#stream-elements">
 * <cite>Java Object Serialization Specification</cite>, Section 6.2, "Stream Elements"</a>
 */
@java.io.Serial
private static final ObjectStreamField[] serialPersistentFields =
    new ObjectStreamField[0];

/**
 * Initializes a newly created {@code String} object so that it represents
 * an empty character sequence.  Note that use of this constructor is
 * unnecessary since Strings are immutable.
 */
public String() {
    // Reuse the backing array and coder of the "" literal rather than
    // allocating a fresh empty array.
    this.value = "".value;
    this.coder = "".coder;
}
/**
 * Initializes a newly created {@code String} object so that it represents
 * the same sequence of characters as the argument; in other words, the
 * newly created string is a copy of the argument string. Unless an
 * explicit copy of {@code original} is needed, use of this constructor is
 * unnecessary since Strings are immutable.
 *
 * @param  original
 *         A {@code String}
 */
@IntrinsicCandidate
public String(String original) {
    // The backing array reference is shared with the original, not
    // duplicated; the cached hash state is carried over as well.
    this.value = original.value;
    this.coder = original.coder;
    this.hash = original.hash;
    this.hashIsZero = original.hashIsZero;
}

/**
 * Allocates a new {@code String} so that it represents the sequence of
 * characters currently contained in the character array argument. The
 * contents of the character array are copied; subsequent modification of
 * the character array does not affect the newly created string.
 *
 * @param  value
 *         The initial value of the string
 */
public String(char[] value) {
    // Delegates to a four-argument constructor declared outside this
    // view; the whole array is used, so no range check is requested.
    this(value, 0, value.length, null);
}

/**
 * Allocates a new {@code String} that contains characters from a subarray
 * of the character array argument. The {@code offset} argument is the
 * index of the first character of the subarray and the {@code count}
 * argument specifies the length of the subarray. The contents of the
 * subarray are copied; subsequent modification of the character array does
 * not affect the newly created string.
 *
 * @param  value
 *         Array that is the source of characters
 *
 * @param  offset
 *         The initial offset
 *
 * @param  count
 *         The length
 *
 * @throws  IndexOutOfBoundsException
 *          If {@code offset} is negative, {@code count} is negative, or
 *          {@code offset} is greater than {@code value.length - count}
 */
public String(char[] value, int offset, int count) {
    // rangeCheck(...) is evaluated as an argument so that the bounds are
    // validated before the delegated constructor body runs.
    this(value, offset, count, rangeCheck(value, offset, count));
}

/**
 * Validates {@code offset}/{@code count} against {@code value.length} via
 * {@code checkBoundsOffCount}. Always returns {@code null}; the
 * {@code Void} return type only exists so the call can be used as an
 * argument expression inside a {@code this(...)} invocation.
 */
private static Void rangeCheck(char[] value, int offset, int count) {
    checkBoundsOffCount(offset, count, value.length);
    return null;
}
/**
 * Allocates a new {@code String} that contains characters from a subarray
 * of the <a href="Character.html#unicode">Unicode code point</a> array
 * argument.  The {@code offset} argument is the index of the first code
 * point of the subarray and the {@code count} argument specifies the
 * length of the subarray.  The contents of the subarray are converted to
 * {@code char}s; subsequent modification of the {@code int} array does not
 * affect the newly created string.
 *
 * @param  codePoints
 *         Array that is the source of Unicode code points
 *
 * @param  offset
 *         The initial offset
 *
 * @param  count
 *         The length
 *
 * @throws  IllegalArgumentException
 *          If any invalid Unicode code point is found in {@code
 *          codePoints}
 *
 * @throws  IndexOutOfBoundsException
 *          If {@code offset} is negative, {@code count} is negative, or
 *          {@code offset} is greater than {@code codePoints.length - count}
 *
 * @since  1.5
 */
public String(int[] codePoints, int offset, int count) {
    checkBoundsOffCount(offset, count, codePoints.length);
    if (count != 0) {
        // Try the compact one-byte form first; a null result means at
        // least one code point did not fit and UTF16 must be used.
        byte[] latin1 = COMPACT_STRINGS
                ? StringLatin1.toBytes(codePoints, offset, count)
                : null;
        if (latin1 == null) {
            this.coder = UTF16;
            this.value = StringUTF16.toBytes(codePoints, offset, count);
        } else {
            this.coder = LATIN1;
            this.value = latin1;
        }
    } else {
        // Empty range: share the storage of the "" literal.
        this.value = "".value;
        this.coder = "".coder;
    }
}
370 * As of JDK 1.1, the preferred way to do this is via the 371 * {@code String} constructors that take a {@link Charset}, charset name, 372 * or that use the {@link Charset#defaultCharset() default charset}. 373 * 374 * @param ascii 375 * The bytes to be converted to characters 376 * 377 * @param hibyte 378 * The top 8 bits of each 16-bit Unicode code unit 379 * 380 * @param offset 381 * The initial offset 382 * @param count 383 * The length 384 * 385 * @throws IndexOutOfBoundsException 386 * If {@code offset} is negative, {@code count} is negative, or 387 * {@code offset} is greater than {@code ascii.length - count} 388 * 389 * @see #String(byte[], int) 390 * @see #String(byte[], int, int, java.lang.String) 391 * @see #String(byte[], int, int, java.nio.charset.Charset) 392 * @see #String(byte[], int, int) 393 * @see #String(byte[], java.lang.String) 394 * @see #String(byte[], java.nio.charset.Charset) 395 * @see #String(byte[]) 396 */ 397 @Deprecated(since="1.1") 398 public String(byte[] ascii, int hibyte, int offset, int count) { 399 checkBoundsOffCount(offset, count, ascii.length); 400 if (count == 0) { 401 this.value = "".value; 402 this.coder = "".coder; 403 return; 404 } 405 if (COMPACT_STRINGS && (byte)hibyte == 0) { 406 this.value = Arrays.copyOfRange(ascii, offset, offset + count); 407 this.coder = LATIN1; 408 } else { 409 hibyte <<= 8; 410 byte[] val = StringUTF16.newBytesFor(count); 411 for (int i = 0; i < count; i++) { 412 StringUTF16.putChar(val, i, hibyte | (ascii[offset++] & 0xff)); 413 } 414 this.value = val; 415 this.coder = UTF16; 416 } 417 } 418 419 /** 420 * Allocates a new {@code String} containing characters constructed from 421 * an array of 8-bit integer values. 
/**
 * Allocates a new {@code String} containing characters constructed from
 * an array of 8-bit integer values. Each character <i>c</i> in the
 * resulting string is constructed from the corresponding component
 * <i>b</i> in the byte array such that:
 *
 * <blockquote><pre>
 *     <b><i>c</i></b> == (char)(((hibyte &amp; 0xff) &lt;&lt; 8)
 *                         | (<b><i>b</i></b> &amp; 0xff))
 * </pre></blockquote>
 *
 * @deprecated  This method does not properly convert bytes into
 * characters.  As of JDK&nbsp;1.1, the preferred way to do this is via the
 * {@code String} constructors that take a {@link Charset}, charset name,
 * or that use the {@link Charset#defaultCharset() default charset}.
 *
 * @param  ascii
 *         The bytes to be converted to characters
 *
 * @param  hibyte
 *         The top 8 bits of each 16-bit Unicode code unit
 *
 * @see  #String(byte[], int, int, java.lang.String)
 * @see  #String(byte[], int, int, java.nio.charset.Charset)
 * @see  #String(byte[], int, int)
 * @see  #String(byte[], java.lang.String)
 * @see  #String(byte[], java.nio.charset.Charset)
 * @see  #String(byte[])
 */
@Deprecated(since="1.1")
public String(byte[] ascii, int hibyte) {
    // Whole-array form of the (ascii, hibyte, offset, count) constructor.
    this(ascii, hibyte, 0, ascii.length);
}
/**
 * Constructs a new {@code String} by decoding the specified subarray of
 * bytes using the specified charset.  The length of the new {@code String}
 * is a function of the charset, and hence may not be equal to the length
 * of the subarray.
 *
 * <p> The behavior of this constructor when the given bytes are not valid
 * in the given charset is unspecified.  The {@link
 * java.nio.charset.CharsetDecoder} class should be used when more control
 * over the decoding process is required.
 *
 * @param  bytes
 *         The bytes to be decoded into characters
 *
 * @param  offset
 *         The index of the first byte to decode
 *
 * @param  length
 *         The number of bytes to decode
 *
 * @param  charsetName
 *         The name of a supported {@linkplain java.nio.charset.Charset
 *         charset}
 *
 * @throws  UnsupportedEncodingException
 *          If the named charset is not supported
 *
 * @throws  IndexOutOfBoundsException
 *          If {@code offset} is negative, {@code length} is negative, or
 *          {@code offset} is greater than {@code bytes.length - length}
 *
 * @since  1.1
 */
public String(byte[] bytes, int offset, int length, String charsetName)
        throws UnsupportedEncodingException {
    // Arguments are evaluated left to right: the charset lookup (and its
    // null check on charsetName) happens before the bounds check.
    // NOTE(review): checkBoundsOffCount is declared outside this view; its
    // result is passed on as the offset, so it presumably returns offset.
    this(lookupCharset(charsetName), bytes, checkBoundsOffCount(offset, length, bytes.length), length);
}
501 * 502 * @param bytes 503 * The bytes to be decoded into characters 504 * 505 * @param offset 506 * The index of the first byte to decode 507 * 508 * @param length 509 * The number of bytes to decode 510 * 511 * @param charset 512 * The {@linkplain java.nio.charset.Charset charset} to be used to 513 * decode the {@code bytes} 514 * 515 * @throws IndexOutOfBoundsException 516 * If {@code offset} is negative, {@code length} is negative, or 517 * {@code offset} is greater than {@code bytes.length - length} 518 * 519 * @since 1.6 520 */ 521 public String(byte[] bytes, int offset, int length, Charset charset) { 522 this(Objects.requireNonNull(charset), bytes, checkBoundsOffCount(offset, length, bytes.length), length); 523 } 524 525 /** 526 * This method does not do any precondition checks on its arguments. 527 * <p> 528 * Important: parameter order of this method is deliberately changed in order to 529 * disambiguate it against other similar methods of this class. 530 */ 531 @SuppressWarnings("removal") 532 private String(Charset charset, byte[] bytes, int offset, int length) { 533 if (length == 0) { 534 this.value = "".value; 535 this.coder = "".coder; 536 } else if (charset == UTF_8.INSTANCE) { 537 if (COMPACT_STRINGS) { 538 int dp = StringCoding.countPositives(bytes, offset, length); 539 if (dp == length) { 540 this.value = Arrays.copyOfRange(bytes, offset, offset + length); 541 this.coder = LATIN1; 542 return; 543 } 544 int sl = offset + length; 545 byte[] dst = new byte[length]; 546 if (dp > 0) { 547 System.arraycopy(bytes, offset, dst, 0, dp); 548 offset += dp; 549 } 550 while (offset < sl) { 551 int b1 = bytes[offset++]; 552 if (b1 >= 0) { 553 dst[dp++] = (byte)b1; 554 continue; 555 } 556 if ((b1 & 0xfe) == 0xc2 && offset < sl) { // b1 either 0xc2 or 0xc3 557 int b2 = bytes[offset]; 558 if (b2 < -64) { // continuation bytes are always negative values in the range -128 to -65 559 dst[dp++] = (byte)decode2(b1, b2); 560 offset++; 561 continue; 562 } 563 } 564 // 
anything not a latin1, including the REPL 565 // we have to go with the utf16 566 offset--; 567 break; 568 } 569 if (offset == sl) { 570 if (dp != dst.length) { 571 dst = Arrays.copyOf(dst, dp); 572 } 573 this.value = dst; 574 this.coder = LATIN1; 575 return; 576 } 577 byte[] buf = new byte[length << 1]; 578 StringLatin1.inflate(dst, 0, buf, 0, dp); 579 dst = buf; 580 dp = decodeUTF8_UTF16(bytes, offset, sl, dst, dp, true); 581 if (dp != length) { 582 dst = Arrays.copyOf(dst, dp << 1); 583 } 584 this.value = dst; 585 this.coder = UTF16; 586 } else { // !COMPACT_STRINGS 587 byte[] dst = new byte[length << 1]; 588 int dp = decodeUTF8_UTF16(bytes, offset, offset + length, dst, 0, true); 589 if (dp != length) { 590 dst = Arrays.copyOf(dst, dp << 1); 591 } 592 this.value = dst; 593 this.coder = UTF16; 594 } 595 } else if (charset == ISO_8859_1.INSTANCE) { 596 if (COMPACT_STRINGS) { 597 this.value = Arrays.copyOfRange(bytes, offset, offset + length); 598 this.coder = LATIN1; 599 } else { 600 this.value = StringLatin1.inflate(bytes, offset, length); 601 this.coder = UTF16; 602 } 603 } else if (charset == US_ASCII.INSTANCE) { 604 if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) { 605 this.value = Arrays.copyOfRange(bytes, offset, offset + length); 606 this.coder = LATIN1; 607 } else { 608 byte[] dst = new byte[length << 1]; 609 int dp = 0; 610 while (dp < length) { 611 int b = bytes[offset++]; 612 StringUTF16.putChar(dst, dp++, (b >= 0) ? (char) b : REPL); 613 } 614 this.value = dst; 615 this.coder = UTF16; 616 } 617 } else { 618 // (1)We never cache the "external" cs, the only benefit of creating 619 // an additional StringDe/Encoder object to wrap it is to share the 620 // de/encode() method. These SD/E objects are short-lived, the young-gen 621 // gc should be able to take care of them well. But the best approach 622 // is still not to generate them if not really necessary. 
623 // (2)The defensive copy of the input byte/char[] has a big performance 624 // impact, as well as the outgoing result byte/char[]. Need to do the 625 // optimization check of (sm==null && classLoader0==null) for both. 626 CharsetDecoder cd = charset.newDecoder(); 627 // ArrayDecoder fastpaths 628 if (cd instanceof ArrayDecoder ad) { 629 // ascii 630 if (ad.isASCIICompatible() && !StringCoding.hasNegatives(bytes, offset, length)) { 631 if (COMPACT_STRINGS) { 632 this.value = Arrays.copyOfRange(bytes, offset, offset + length); 633 this.coder = LATIN1; 634 return; 635 } 636 this.value = StringLatin1.inflate(bytes, offset, length); 637 this.coder = UTF16; 638 return; 639 } 640 641 // fastpath for always Latin1 decodable single byte 642 if (COMPACT_STRINGS && ad.isLatin1Decodable()) { 643 byte[] dst = new byte[length]; 644 ad.decodeToLatin1(bytes, offset, length, dst); 645 this.value = dst; 646 this.coder = LATIN1; 647 return; 648 } 649 650 int en = scale(length, cd.maxCharsPerByte()); 651 cd.onMalformedInput(CodingErrorAction.REPLACE) 652 .onUnmappableCharacter(CodingErrorAction.REPLACE); 653 char[] ca = new char[en]; 654 int clen = ad.decode(bytes, offset, length, ca); 655 if (COMPACT_STRINGS) { 656 byte[] bs = StringUTF16.compress(ca, 0, clen); 657 if (bs != null) { 658 value = bs; 659 coder = LATIN1; 660 return; 661 } 662 } 663 coder = UTF16; 664 value = StringUTF16.toBytes(ca, 0, clen); 665 return; 666 } 667 668 // decode using CharsetDecoder 669 int en = scale(length, cd.maxCharsPerByte()); 670 cd.onMalformedInput(CodingErrorAction.REPLACE) 671 .onUnmappableCharacter(CodingErrorAction.REPLACE); 672 char[] ca = new char[en]; 673 if (charset.getClass().getClassLoader0() != null && 674 System.getSecurityManager() != null) { 675 bytes = Arrays.copyOfRange(bytes, offset, offset + length); 676 offset = 0; 677 } 678 679 int caLen; 680 try { 681 caLen = decodeWithDecoder(cd, ca, bytes, offset, length); 682 } catch (CharacterCodingException x) { 683 // Substitution is 
enabled, so this shouldn't happen 684 throw new Error(x); 685 } 686 if (COMPACT_STRINGS) { 687 byte[] bs = StringUTF16.compress(ca, 0, caLen); 688 if (bs != null) { 689 value = bs; 690 coder = LATIN1; 691 return; 692 } 693 } 694 coder = UTF16; 695 value = StringUTF16.toBytes(ca, 0, caLen); 696 } 697 } 698 699 /* 700 * Throws iae, instead of replacing, if malformed or unmappable. 701 * 702 * @param noShare 703 * {@code true} if the resulting string MUST NOT share the byte array, 704 * {@code false} if the byte array can be exclusively used to construct 705 * the string and is not modified or used for any other purpose. 706 */ 707 static String newStringUTF8NoRepl(byte[] bytes, int offset, int length, boolean noShare) { 708 checkBoundsOffCount(offset, length, bytes.length); 709 if (length == 0) { 710 return ""; 711 } 712 int dp; 713 byte[] dst; 714 if (COMPACT_STRINGS) { 715 dp = StringCoding.countPositives(bytes, offset, length); 716 int sl = offset + length; 717 if (dp == length) { 718 if (noShare || length != bytes.length) { 719 return new String(Arrays.copyOfRange(bytes, offset, offset + length), LATIN1); 720 } else { 721 return new String(bytes, LATIN1); 722 } 723 } 724 dst = new byte[length]; 725 System.arraycopy(bytes, offset, dst, 0, dp); 726 offset += dp; 727 while (offset < sl) { 728 int b1 = bytes[offset++]; 729 if (b1 >= 0) { 730 dst[dp++] = (byte)b1; 731 continue; 732 } 733 if ((b1 & 0xfe) == 0xc2 && offset < sl) { // b1 either 0xc2 or 0xc3 734 int b2 = bytes[offset]; 735 if (b2 < -64) { // continuation bytes are always negative values in the range -128 to -65 736 dst[dp++] = (byte)decode2(b1, b2); 737 offset++; 738 continue; 739 } 740 } 741 // anything not a latin1, including the REPL 742 // we have to go with the utf16 743 offset--; 744 break; 745 } 746 if (offset == sl) { 747 if (dp != dst.length) { 748 dst = Arrays.copyOf(dst, dp); 749 } 750 return new String(dst, LATIN1); 751 } 752 if (dp == 0) { 753 dst = new byte[length << 1]; 754 } else { 755 
byte[] buf = new byte[length << 1]; 756 StringLatin1.inflate(dst, 0, buf, 0, dp); 757 dst = buf; 758 } 759 dp = decodeUTF8_UTF16(bytes, offset, sl, dst, dp, false); 760 } else { // !COMPACT_STRINGS 761 dst = new byte[length << 1]; 762 dp = decodeUTF8_UTF16(bytes, offset, offset + length, dst, 0, false); 763 } 764 if (dp != length) { 765 dst = Arrays.copyOf(dst, dp << 1); 766 } 767 return new String(dst, UTF16); 768 } 769 770 static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingException { 771 try { 772 return newStringNoRepl1(src, cs); 773 } catch (IllegalArgumentException e) { 774 //newStringNoRepl1 throws IAE with MalformedInputException or CCE as the cause 775 Throwable cause = e.getCause(); 776 if (cause instanceof MalformedInputException mie) { 777 throw mie; 778 } 779 throw (CharacterCodingException)cause; 780 } 781 } 782 783 @SuppressWarnings("removal") 784 private static String newStringNoRepl1(byte[] src, Charset cs) { 785 int len = src.length; 786 if (len == 0) { 787 return ""; 788 } 789 if (cs == UTF_8.INSTANCE) { 790 return newStringUTF8NoRepl(src, 0, src.length, false); 791 } 792 if (cs == ISO_8859_1.INSTANCE) { 793 if (COMPACT_STRINGS) 794 return new String(src, LATIN1); 795 return new String(StringLatin1.inflate(src, 0, src.length), UTF16); 796 } 797 if (cs == US_ASCII.INSTANCE) { 798 if (!StringCoding.hasNegatives(src, 0, src.length)) { 799 if (COMPACT_STRINGS) 800 return new String(src, LATIN1); 801 return new String(StringLatin1.inflate(src, 0, src.length), UTF16); 802 } else { 803 throwMalformed(src); 804 } 805 } 806 807 CharsetDecoder cd = cs.newDecoder(); 808 // ascii fastpath 809 if (cd instanceof ArrayDecoder ad && 810 ad.isASCIICompatible() && 811 !StringCoding.hasNegatives(src, 0, src.length)) { 812 if (COMPACT_STRINGS) 813 return new String(src, LATIN1); 814 return new String(src, 0, src.length, ISO_8859_1.INSTANCE); 815 } 816 int en = scale(len, cd.maxCharsPerByte()); 817 char[] ca = new char[en]; 818 if 
(cs.getClass().getClassLoader0() != null && 819 System.getSecurityManager() != null) { 820 src = Arrays.copyOf(src, len); 821 } 822 int caLen; 823 try { 824 caLen = decodeWithDecoder(cd, ca, src, 0, src.length); 825 } catch (CharacterCodingException x) { 826 // throw via IAE 827 throw new IllegalArgumentException(x); 828 } 829 if (COMPACT_STRINGS) { 830 byte[] bs = StringUTF16.compress(ca, 0, caLen); 831 if (bs != null) { 832 return new String(bs, LATIN1); 833 } 834 } 835 return new String(StringUTF16.toBytes(ca, 0, caLen), UTF16); 836 } 837 838 private static final char REPL = '\ufffd'; 839 840 // Trim the given byte array to the given length 841 @SuppressWarnings("removal") 842 private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) { 843 if (len == ba.length && (isTrusted || System.getSecurityManager() == null)) { 844 return ba; 845 } else { 846 return Arrays.copyOf(ba, len); 847 } 848 } 849 850 private static int scale(int len, float expansionFactor) { 851 // We need to perform double, not float, arithmetic; otherwise 852 // we lose low order bits when len is larger than 2**24. 
/*
 * Resolves a charset name. A null name fails with NullPointerException;
 * an unsupported or illegal name is reported as
 * UnsupportedEncodingException carrying the name as its message.
 */
private static Charset lookupCharset(String csn) throws UnsupportedEncodingException {
    Objects.requireNonNull(csn);
    try {
        return Charset.forName(csn);
    } catch (UnsupportedCharsetException | IllegalCharsetNameException x) {
        throw new UnsupportedEncodingException(csn);
    }
}

/*
 * Encodes the string storage (coder + val) into bytes of charset cs.
 * UTF-8, ISO-8859-1 and US-ASCII have dedicated encoders; everything else
 * goes through encodeWithEncoder with replacement enabled.
 */
private static byte[] encode(Charset cs, byte coder, byte[] val) {
    if (cs == UTF_8.INSTANCE) {
        return encodeUTF8(coder, val, true);
    }
    if (cs == ISO_8859_1.INSTANCE) {
        return encode8859_1(coder, val);
    }
    if (cs == US_ASCII.INSTANCE) {
        return encodeASCII(coder, val);
    }
    return encodeWithEncoder(cs, coder, val, true);
}

/*
 * Encodes through a freshly created CharsetEncoder of cs.
 *
 * With doReplace the encoder substitutes malformed/unmappable input and a
 * CharacterCodingException is treated as an internal error (Error).
 * Without it, the exception is rethrown wrapped in an
 * IllegalArgumentException.
 */
private static byte[] encodeWithEncoder(Charset cs, byte coder, byte[] val, boolean doReplace) {
    CharsetEncoder ce = cs.newEncoder();
    int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
    int en = scale(len, ce.maxBytesPerChar());
    // fastpath with ArrayEncoder implies `doReplace`.
    if (doReplace && ce instanceof ArrayEncoder ae) {
        // fastpath for ascii compatible
        if (coder == LATIN1 &&
                ae.isASCIICompatible() &&
                !StringCoding.hasNegatives(val, 0, val.length)) {
            return val.clone();
        }
        byte[] ba = new byte[en];
        if (len == 0) {
            return ba;
        }

        int blen = (coder == LATIN1) ? ae.encodeFromLatin1(val, 0, len, ba)
                                     : ae.encodeFromUTF16(val, 0, len, ba);
        // A result of -1 falls through to the generic encoder path below.
        if (blen != -1) {
            return safeTrim(ba, blen, true);
        }
    }

    byte[] ba = new byte[en];
    if (len == 0) {
        return ba;
    }
    if (doReplace) {
        ce.onMalformedInput(CodingErrorAction.REPLACE)
          .onUnmappableCharacter(CodingErrorAction.REPLACE);
    }
    char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
                                   : StringUTF16.toChars(val);
    ByteBuffer bb = ByteBuffer.wrap(ba);
    CharBuffer cb = CharBuffer.wrap(ca, 0, len);
    try {
        CoderResult cr = ce.encode(cb, bb, true);
        if (!cr.isUnderflow())
            cr.throwException();
        cr = ce.flush(bb);
        if (!cr.isUnderflow())
            cr.throwException();
    } catch (CharacterCodingException x) {
        if (!doReplace) {
            throw new IllegalArgumentException(x);
        } else {
            throw new Error(x);
        }
    }
    // The result array is treated as trusted only when the charset class
    // has a null class loader.
    return safeTrim(ba, bb.position(), cs.getClass().getClassLoader0() == null);
}

/*
 * Throws iae, instead of replacing, if unmappable.
 */
static byte[] getBytesUTF8NoRepl(String s) {
    return encodeUTF8(s.coder(), s.value(), false);
}

// True when src contains no negative byte, i.e. every byte is 7-bit.
private static boolean isASCII(byte[] src) {
    return !StringCoding.hasNegatives(src, 0, src.length);
}

/*
 * Throws CCE, instead of replacing, if unmappable.
 */
static byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException {
    try {
        return getBytesNoRepl1(s, cs);
    } catch (IllegalArgumentException e) {
        //getBytesNoRepl1 throws IAE with UnmappableCharacterException or CCE as the cause
        Throwable cause = e.getCause();
        if (cause instanceof UnmappableCharacterException) {
            throw (UnmappableCharacterException)cause;
        }
        throw (CharacterCodingException)cause;
    }
}

/*
 * Worker for getBytesNoRepl. Several fast paths return the string's own
 * backing array (s.value()) without copying it.
 */
private static byte[] getBytesNoRepl1(String s, Charset cs) {
    byte[] val = s.value();
    byte coder = s.coder();
    if (cs == UTF_8.INSTANCE) {
        if (coder == LATIN1 && isASCII(val)) {
            return val;
        }
        return encodeUTF8(coder, val, false);
    }
    if (cs == ISO_8859_1.INSTANCE) {
        if (coder == LATIN1) {
            return val;
        }
        return encode8859_1(coder, val, false);
    }
    if (cs == US_ASCII.INSTANCE) {
        if (coder == LATIN1) {
            if (isASCII(val)) {
                return val;
            } else {
                // NOTE(review): throwUnmappable is declared outside this
                // view; presumably it always throws.
                throwUnmappable(val);
            }
        }
    }
    return encodeWithEncoder(cs, coder, val, false);
}
encodeWithEncoder(cs, coder, val, false); 983 } 984 985 private static byte[] encodeASCII(byte coder, byte[] val) { 986 if (coder == LATIN1) { 987 int positives = StringCoding.countPositives(val, 0, val.length); 988 byte[] dst = val.clone(); 989 if (positives < dst.length) { 990 replaceNegatives(dst, positives); 991 } 992 return dst; 993 } 994 int len = val.length >> 1; 995 byte[] dst = new byte[len]; 996 int dp = 0; 997 for (int i = 0; i < len; i++) { 998 char c = StringUTF16.getChar(val, i); 999 if (c < 0x80) { 1000 dst[dp++] = (byte)c; 1001 continue; 1002 } 1003 if (Character.isHighSurrogate(c) && i + 1 < len && 1004 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) { 1005 i++; 1006 } 1007 dst[dp++] = '?'; 1008 } 1009 if (len == dp) { 1010 return dst; 1011 } 1012 return Arrays.copyOf(dst, dp); 1013 } 1014 1015 private static void replaceNegatives(byte[] val, int fromIndex) { 1016 for (int i = fromIndex; i < val.length; i++) { 1017 if (val[i] < 0) { 1018 val[i] = '?'; 1019 } 1020 } 1021 } 1022 1023 private static byte[] encode8859_1(byte coder, byte[] val) { 1024 return encode8859_1(coder, val, true); 1025 } 1026 1027 private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) { 1028 if (coder == LATIN1) { 1029 return val.clone(); 1030 } 1031 int len = val.length >> 1; 1032 byte[] dst = new byte[len]; 1033 int dp = 0; 1034 int sp = 0; 1035 int sl = len; 1036 while (sp < sl) { 1037 int ret = StringCoding.implEncodeISOArray(val, sp, dst, dp, len); 1038 sp = sp + ret; 1039 dp = dp + ret; 1040 if (ret != len) { 1041 if (!doReplace) { 1042 throwUnmappable(sp); 1043 } 1044 char c = StringUTF16.getChar(val, sp++); 1045 if (Character.isHighSurrogate(c) && sp < sl && 1046 Character.isLowSurrogate(StringUTF16.getChar(val, sp))) { 1047 sp++; 1048 } 1049 dst[dp++] = '?'; 1050 len = sl - sp; 1051 } 1052 } 1053 if (dp == dst.length) { 1054 return dst; 1055 } 1056 return Arrays.copyOf(dst, dp); 1057 } 1058 1059 //////////////////////////////// utf8 
//////////////////////////////////// 1060 1061 /** 1062 * Decodes ASCII from the source byte array into the destination 1063 * char array. Used via JavaLangAccess from UTF_8 and other charset 1064 * decoders. 1065 * 1066 * @return the number of bytes successfully decoded, at most len 1067 */ 1068 /* package-private */ 1069 static int decodeASCII(byte[] sa, int sp, char[] da, int dp, int len) { 1070 int count = StringCoding.countPositives(sa, sp, len); 1071 while (count < len) { 1072 if (sa[sp + count] < 0) { 1073 break; 1074 } 1075 count++; 1076 } 1077 StringLatin1.inflate(sa, sp, da, dp, count); 1078 return count; 1079 } 1080 1081 private static boolean isNotContinuation(int b) { 1082 return (b & 0xc0) != 0x80; 1083 } 1084 1085 private static boolean isMalformed3(int b1, int b2, int b3) { 1086 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 1087 (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; 1088 } 1089 1090 private static boolean isMalformed3_2(int b1, int b2) { 1091 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 1092 (b2 & 0xc0) != 0x80; 1093 } 1094 1095 private static boolean isMalformed4(int b2, int b3, int b4) { 1096 return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || 1097 (b4 & 0xc0) != 0x80; 1098 } 1099 1100 private static boolean isMalformed4_2(int b1, int b2) { 1101 return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || 1102 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || 1103 (b2 & 0xc0) != 0x80; 1104 } 1105 1106 private static boolean isMalformed4_3(int b3) { 1107 return (b3 & 0xc0) != 0x80; 1108 } 1109 1110 private static char decode2(int b1, int b2) { 1111 return (char)(((b1 << 6) ^ b2) ^ 1112 (((byte) 0xC0 << 6) ^ 1113 ((byte) 0x80 << 0))); 1114 } 1115 1116 private static char decode3(int b1, int b2, int b3) { 1117 return (char)((b1 << 12) ^ 1118 (b2 << 6) ^ 1119 (b3 ^ 1120 (((byte) 0xE0 << 12) ^ 1121 ((byte) 0x80 << 6) ^ 1122 ((byte) 0x80 << 0)))); 1123 } 1124 1125 private static int decode4(int b1, int b2, int b3, int b4) { 1126 return ((b1 << 18) ^ 1127 
(b2 << 12) ^ 1128 (b3 << 6) ^ 1129 (b4 ^ 1130 (((byte) 0xF0 << 18) ^ 1131 ((byte) 0x80 << 12) ^ 1132 ((byte) 0x80 << 6) ^ 1133 ((byte) 0x80 << 0)))); 1134 } 1135 1136 private static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int dp, boolean doReplace) { 1137 while (sp < sl) { 1138 int b1 = src[sp++]; 1139 if (b1 >= 0) { 1140 StringUTF16.putChar(dst, dp++, (char) b1); 1141 } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { 1142 if (sp < sl) { 1143 int b2 = src[sp++]; 1144 if (isNotContinuation(b2)) { 1145 if (!doReplace) { 1146 throwMalformed(sp - 1, 1); 1147 } 1148 StringUTF16.putChar(dst, dp++, REPL); 1149 sp--; 1150 } else { 1151 StringUTF16.putChar(dst, dp++, decode2(b1, b2)); 1152 } 1153 continue; 1154 } 1155 if (!doReplace) { 1156 throwMalformed(sp, 1); // underflow() 1157 } 1158 StringUTF16.putChar(dst, dp++, REPL); 1159 break; 1160 } else if ((b1 >> 4) == -2) { 1161 if (sp + 1 < sl) { 1162 int b2 = src[sp++]; 1163 int b3 = src[sp++]; 1164 if (isMalformed3(b1, b2, b3)) { 1165 if (!doReplace) { 1166 throwMalformed(sp - 3, 3); 1167 } 1168 StringUTF16.putChar(dst, dp++, REPL); 1169 sp -= 3; 1170 sp += malformed3(src, sp); 1171 } else { 1172 char c = decode3(b1, b2, b3); 1173 if (Character.isSurrogate(c)) { 1174 if (!doReplace) { 1175 throwMalformed(sp - 3, 3); 1176 } 1177 StringUTF16.putChar(dst, dp++, REPL); 1178 } else { 1179 StringUTF16.putChar(dst, dp++, c); 1180 } 1181 } 1182 continue; 1183 } 1184 if (sp < sl && isMalformed3_2(b1, src[sp])) { 1185 if (!doReplace) { 1186 throwMalformed(sp - 1, 2); 1187 } 1188 StringUTF16.putChar(dst, dp++, REPL); 1189 continue; 1190 } 1191 if (!doReplace) { 1192 throwMalformed(sp, 1); 1193 } 1194 StringUTF16.putChar(dst, dp++, REPL); 1195 break; 1196 } else if ((b1 >> 3) == -2) { 1197 if (sp + 2 < sl) { 1198 int b2 = src[sp++]; 1199 int b3 = src[sp++]; 1200 int b4 = src[sp++]; 1201 int uc = decode4(b1, b2, b3, b4); 1202 if (isMalformed4(b2, b3, b4) || 1203 !Character.isSupplementaryCodePoint(uc)) { // 
shortest form check 1204 if (!doReplace) { 1205 throwMalformed(sp - 4, 4); 1206 } 1207 StringUTF16.putChar(dst, dp++, REPL); 1208 sp -= 4; 1209 sp += malformed4(src, sp); 1210 } else { 1211 StringUTF16.putChar(dst, dp++, Character.highSurrogate(uc)); 1212 StringUTF16.putChar(dst, dp++, Character.lowSurrogate(uc)); 1213 } 1214 continue; 1215 } 1216 b1 &= 0xff; 1217 if (b1 > 0xf4 || sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) { 1218 if (!doReplace) { 1219 throwMalformed(sp - 1, 1); // or 2 1220 } 1221 StringUTF16.putChar(dst, dp++, REPL); 1222 continue; 1223 } 1224 if (!doReplace) { 1225 throwMalformed(sp - 1, 1); 1226 } 1227 sp++; 1228 StringUTF16.putChar(dst, dp++, REPL); 1229 if (sp < sl && isMalformed4_3(src[sp])) { 1230 continue; 1231 } 1232 break; 1233 } else { 1234 if (!doReplace) { 1235 throwMalformed(sp - 1, 1); 1236 } 1237 StringUTF16.putChar(dst, dp++, REPL); 1238 } 1239 } 1240 return dp; 1241 } 1242 1243 private static int decodeWithDecoder(CharsetDecoder cd, char[] dst, byte[] src, int offset, int length) 1244 throws CharacterCodingException { 1245 ByteBuffer bb = ByteBuffer.wrap(src, offset, length); 1246 CharBuffer cb = CharBuffer.wrap(dst, 0, dst.length); 1247 CoderResult cr = cd.decode(bb, cb, true); 1248 if (!cr.isUnderflow()) 1249 cr.throwException(); 1250 cr = cd.flush(cb); 1251 if (!cr.isUnderflow()) 1252 cr.throwException(); 1253 return cb.position(); 1254 } 1255 1256 private static int malformed3(byte[] src, int sp) { 1257 int b1 = src[sp++]; 1258 int b2 = src[sp]; // no need to lookup b3 1259 return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 1260 isNotContinuation(b2)) ? 
1 : 2; 1261 } 1262 1263 private static int malformed4(byte[] src, int sp) { 1264 // we don't care the speed here 1265 int b1 = src[sp++] & 0xff; 1266 int b2 = src[sp++] & 0xff; 1267 if (b1 > 0xf4 || 1268 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || 1269 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || 1270 isNotContinuation(b2)) 1271 return 1; 1272 if (isNotContinuation(src[sp])) 1273 return 2; 1274 return 3; 1275 } 1276 1277 private static void throwMalformed(int off, int nb) { 1278 String msg = "malformed input off : " + off + ", length : " + nb; 1279 throw new IllegalArgumentException(msg, new MalformedInputException(nb)); 1280 } 1281 1282 private static void throwMalformed(byte[] val) { 1283 int dp = StringCoding.countPositives(val, 0, val.length); 1284 throwMalformed(dp, 1); 1285 } 1286 1287 private static void throwUnmappable(int off) { 1288 String msg = "malformed input off : " + off + ", length : 1"; 1289 throw new IllegalArgumentException(msg, new UnmappableCharacterException(1)); 1290 } 1291 1292 private static void throwUnmappable(byte[] val) { 1293 int dp = StringCoding.countPositives(val, 0, val.length); 1294 throwUnmappable(dp); 1295 } 1296 1297 private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { 1298 if (coder == UTF16) { 1299 return encodeUTF8_UTF16(val, doReplace); 1300 } 1301 1302 if (!StringCoding.hasNegatives(val, 0, val.length)) { 1303 return val.clone(); 1304 } 1305 1306 int dp = 0; 1307 byte[] dst = new byte[val.length << 1]; 1308 for (byte c : val) { 1309 if (c < 0) { 1310 dst[dp++] = (byte) (0xc0 | ((c & 0xff) >> 6)); 1311 dst[dp++] = (byte) (0x80 | (c & 0x3f)); 1312 } else { 1313 dst[dp++] = c; 1314 } 1315 } 1316 if (dp == dst.length) { 1317 return dst; 1318 } 1319 return Arrays.copyOf(dst, dp); 1320 } 1321 1322 private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { 1323 int dp = 0; 1324 int sp = 0; 1325 int sl = val.length >> 1; 1326 byte[] dst = new byte[sl * 3]; 1327 while (sp < sl) { 1328 // ascii fast 
loop; 1329 char c = StringUTF16.getChar(val, sp); 1330 if (c >= '\u0080') { 1331 break; 1332 } 1333 dst[dp++] = (byte)c; 1334 sp++; 1335 } 1336 while (sp < sl) { 1337 char c = StringUTF16.getChar(val, sp++); 1338 if (c < 0x80) { 1339 dst[dp++] = (byte)c; 1340 } else if (c < 0x800) { 1341 dst[dp++] = (byte)(0xc0 | (c >> 6)); 1342 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 1343 } else if (Character.isSurrogate(c)) { 1344 int uc = -1; 1345 char c2; 1346 if (Character.isHighSurrogate(c) && sp < sl && 1347 Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { 1348 uc = Character.toCodePoint(c, c2); 1349 } 1350 if (uc < 0) { 1351 if (doReplace) { 1352 dst[dp++] = '?'; 1353 } else { 1354 throwUnmappable(sp - 1); 1355 } 1356 } else { 1357 dst[dp++] = (byte)(0xf0 | ((uc >> 18))); 1358 dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); 1359 dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); 1360 dst[dp++] = (byte)(0x80 | (uc & 0x3f)); 1361 sp++; // 2 chars 1362 } 1363 } else { 1364 // 3 bytes, 16 bits 1365 dst[dp++] = (byte)(0xe0 | ((c >> 12))); 1366 dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); 1367 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 1368 } 1369 } 1370 if (dp == dst.length) { 1371 return dst; 1372 } 1373 return Arrays.copyOf(dst, dp); 1374 } 1375 1376 /** 1377 * Constructs a new {@code String} by decoding the specified array of bytes 1378 * using the specified {@linkplain java.nio.charset.Charset charset}. The 1379 * length of the new {@code String} is a function of the charset, and hence 1380 * may not be equal to the length of the byte array. 1381 * 1382 * <p> The behavior of this constructor when the given bytes are not valid 1383 * in the given charset is unspecified. The {@link 1384 * java.nio.charset.CharsetDecoder} class should be used when more control 1385 * over the decoding process is required. 
*
     * @param  bytes
     *         The bytes to be decoded into characters
     *
     * @param  charsetName
     *         The name of a supported {@linkplain java.nio.charset.Charset
     *         charset}
     *
     * @throws  UnsupportedEncodingException
     *          If the named charset is not supported
     *
     * @since  1.1
     */
    public String(byte[] bytes, String charsetName)
            throws UnsupportedEncodingException {
        // The charset name is resolved first; the whole array (offset 0,
        // bytes.length bytes) is then handed to the decoding constructor.
        this(lookupCharset(charsetName), bytes, 0, bytes.length);
    }

    /**
     * Constructs a new {@code String} by decoding the specified array of
     * bytes using the specified {@linkplain java.nio.charset.Charset charset}.
     * The length of the new {@code String} is a function of the charset, and
     * hence may not be equal to the length of the byte array.
     *
     * <p> This method always replaces malformed-input and unmappable-character
     * sequences with this charset's default replacement string.  The {@link
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
     *
     * @param  bytes
     *         The bytes to be decoded into characters
     *
     * @param  charset
     *         The {@linkplain java.nio.charset.Charset charset} to be used to
     *         decode the {@code bytes}
     *
     * @since  1.6
     */
    public String(byte[] bytes, Charset charset) {
        // A null charset is rejected by requireNonNull before bytes.length
        // is evaluated; the whole array is decoded.
        this(Objects.requireNonNull(charset), bytes, 0, bytes.length);
    }

    /**
     * Constructs a new {@code String} by decoding the specified subarray of
     * bytes using the {@link Charset#defaultCharset() default charset}.
     * The length of the new {@code String} is a function of the charset,
     * and hence may not be equal to the length of the subarray.
     *
     * <p> The behavior of this constructor when the given bytes are not valid
     * in the default charset is unspecified.
The {@link
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
     *
     * @param  bytes
     *         The bytes to be decoded into characters
     *
     * @param  offset
     *         The index of the first byte to decode
     *
     * @param  length
     *         The number of bytes to decode
     *
     * @throws  IndexOutOfBoundsException
     *          If {@code offset} is negative, {@code length} is negative, or
     *          {@code offset} is greater than {@code bytes.length - length}
     *
     * @since  1.1
     */
    public String(byte[] bytes, int offset, int length) {
        // checkBoundsOffCount runs while the arguments are being evaluated,
        // i.e. before the delegated constructor; its return value is what is
        // passed on as the offset (presumably the validated offset itself).
        this(Charset.defaultCharset(), bytes, checkBoundsOffCount(offset, length, bytes.length), length);
    }

    /**
     * Constructs a new {@code String} by decoding the specified array of bytes
     * using the {@link Charset#defaultCharset() default charset}.  The length
     * of the new {@code String} is a function of the charset, and hence may not
     * be equal to the length of the byte array.
     *
     * <p> The behavior of this constructor when the given bytes are not valid
     * in the default charset is unspecified.  The {@link
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
     *
     * @param  bytes
     *         The bytes to be decoded into characters
     *
     * @since  1.1
     */
    public String(byte[] bytes) {
        // Decodes the whole array: offset 0, bytes.length bytes.
        this(Charset.defaultCharset(), bytes, 0, bytes.length);
    }

    /**
     * Allocates a new string that contains the sequence of characters
     * currently contained in the string buffer argument. The contents of the
     * string buffer are copied; subsequent modification of the string buffer
     * does not affect the newly created string.
1483 * 1484 * @param buffer 1485 * A {@code StringBuffer} 1486 */ 1487 public String(StringBuffer buffer) { 1488 this(buffer.toString()); 1489 } 1490 1491 /** 1492 * Allocates a new string that contains the sequence of characters 1493 * currently contained in the string builder argument. The contents of the 1494 * string builder are copied; subsequent modification of the string builder 1495 * does not affect the newly created string. 1496 * 1497 * <p> This constructor is provided to ease migration to {@code 1498 * StringBuilder}. Obtaining a string from a string builder via the {@code 1499 * toString} method is likely to run faster and is generally preferred. 1500 * 1501 * @param builder 1502 * A {@code StringBuilder} 1503 * 1504 * @since 1.5 1505 */ 1506 public String(StringBuilder builder) { 1507 this(builder, null); 1508 } 1509 1510 /** 1511 * Returns the length of this string. 1512 * The length is equal to the number of <a href="Character.html#unicode">Unicode 1513 * code units</a> in the string. 1514 * 1515 * @return the length of the sequence of characters represented by this 1516 * object. 1517 */ 1518 public int length() { 1519 return value.length >> coder(); 1520 } 1521 1522 /** 1523 * Returns {@code true} if, and only if, {@link #length()} is {@code 0}. 1524 * 1525 * @return {@code true} if {@link #length()} is {@code 0}, otherwise 1526 * {@code false} 1527 * 1528 * @since 1.6 1529 */ 1530 @Override 1531 public boolean isEmpty() { 1532 return value.length == 0; 1533 } 1534 1535 /** 1536 * Returns the {@code char} value at the 1537 * specified index. An index ranges from {@code 0} to 1538 * {@code length() - 1}. The first {@code char} value of the sequence 1539 * is at index {@code 0}, the next at index {@code 1}, 1540 * and so on, as for array indexing. 1541 * 1542 * <p>If the {@code char} value specified by the index is a 1543 * <a href="Character.html#unicode">surrogate</a>, the surrogate 1544 * value is returned. 
1545 * 1546 * @param index the index of the {@code char} value. 1547 * @return the {@code char} value at the specified index of this string. 1548 * The first {@code char} value is at index {@code 0}. 1549 * @throws IndexOutOfBoundsException if the {@code index} 1550 * argument is negative or not less than the length of this 1551 * string. 1552 */ 1553 public char charAt(int index) { 1554 if (isLatin1()) { 1555 return StringLatin1.charAt(value, index); 1556 } else { 1557 return StringUTF16.charAt(value, index); 1558 } 1559 } 1560 1561 /** 1562 * Returns the character (Unicode code point) at the specified 1563 * index. The index refers to {@code char} values 1564 * (Unicode code units) and ranges from {@code 0} to 1565 * {@link #length()}{@code - 1}. 1566 * 1567 * <p> If the {@code char} value specified at the given index 1568 * is in the high-surrogate range, the following index is less 1569 * than the length of this {@code String}, and the 1570 * {@code char} value at the following index is in the 1571 * low-surrogate range, then the supplementary code point 1572 * corresponding to this surrogate pair is returned. Otherwise, 1573 * the {@code char} value at the given index is returned. 1574 * 1575 * @param index the index to the {@code char} values 1576 * @return the code point value of the character at the 1577 * {@code index} 1578 * @throws IndexOutOfBoundsException if the {@code index} 1579 * argument is negative or not less than the length of this 1580 * string. 1581 * @since 1.5 1582 */ 1583 public int codePointAt(int index) { 1584 if (isLatin1()) { 1585 checkIndex(index, value.length); 1586 return value[index] & 0xff; 1587 } 1588 int length = value.length >> 1; 1589 checkIndex(index, length); 1590 return StringUTF16.codePointAt(value, index, length); 1591 } 1592 1593 /** 1594 * Returns the character (Unicode code point) before the specified 1595 * index. 
The index refers to {@code char} values 1596 * (Unicode code units) and ranges from {@code 1} to {@link 1597 * CharSequence#length() length}. 1598 * 1599 * <p> If the {@code char} value at {@code (index - 1)} 1600 * is in the low-surrogate range, {@code (index - 2)} is not 1601 * negative, and the {@code char} value at {@code (index - 1602 * 2)} is in the high-surrogate range, then the 1603 * supplementary code point value of the surrogate pair is 1604 * returned. If the {@code char} value at {@code index - 1605 * 1} is an unpaired low-surrogate or a high-surrogate, the 1606 * surrogate value is returned. 1607 * 1608 * @param index the index following the code point that should be returned 1609 * @return the Unicode code point value before the given index. 1610 * @throws IndexOutOfBoundsException if the {@code index} 1611 * argument is less than 1 or greater than the length 1612 * of this string. 1613 * @since 1.5 1614 */ 1615 public int codePointBefore(int index) { 1616 int i = index - 1; 1617 checkIndex(i, length()); 1618 if (isLatin1()) { 1619 return (value[i] & 0xff); 1620 } 1621 return StringUTF16.codePointBefore(value, index); 1622 } 1623 1624 /** 1625 * Returns the number of Unicode code points in the specified text 1626 * range of this {@code String}. The text range begins at the 1627 * specified {@code beginIndex} and extends to the 1628 * {@code char} at index {@code endIndex - 1}. Thus the 1629 * length (in {@code char}s) of the text range is 1630 * {@code endIndex-beginIndex}. Unpaired surrogates within 1631 * the text range count as one code point each. 1632 * 1633 * @param beginIndex the index to the first {@code char} of 1634 * the text range. 1635 * @param endIndex the index after the last {@code char} of 1636 * the text range. 
1637 * @return the number of Unicode code points in the specified text 1638 * range 1639 * @throws IndexOutOfBoundsException if the 1640 * {@code beginIndex} is negative, or {@code endIndex} 1641 * is larger than the length of this {@code String}, or 1642 * {@code beginIndex} is larger than {@code endIndex}. 1643 * @since 1.5 1644 */ 1645 public int codePointCount(int beginIndex, int endIndex) { 1646 Objects.checkFromToIndex(beginIndex, endIndex, length()); 1647 if (isLatin1()) { 1648 return endIndex - beginIndex; 1649 } 1650 return StringUTF16.codePointCount(value, beginIndex, endIndex); 1651 } 1652 1653 /** 1654 * Returns the index within this {@code String} that is 1655 * offset from the given {@code index} by 1656 * {@code codePointOffset} code points. Unpaired surrogates 1657 * within the text range given by {@code index} and 1658 * {@code codePointOffset} count as one code point each. 1659 * 1660 * @param index the index to be offset 1661 * @param codePointOffset the offset in code points 1662 * @return the index within this {@code String} 1663 * @throws IndexOutOfBoundsException if {@code index} 1664 * is negative or larger than the length of this 1665 * {@code String}, or if {@code codePointOffset} is positive 1666 * and the substring starting with {@code index} has fewer 1667 * than {@code codePointOffset} code points, 1668 * or if {@code codePointOffset} is negative and the substring 1669 * before {@code index} has fewer than the absolute value 1670 * of {@code codePointOffset} code points. 1671 * @since 1.5 1672 */ 1673 public int offsetByCodePoints(int index, int codePointOffset) { 1674 return Character.offsetByCodePoints(this, index, codePointOffset); 1675 } 1676 1677 /** 1678 * Copies characters from this string into the destination character 1679 * array. 
1680 * <p> 1681 * The first character to be copied is at index {@code srcBegin}; 1682 * the last character to be copied is at index {@code srcEnd-1} 1683 * (thus the total number of characters to be copied is 1684 * {@code srcEnd-srcBegin}). The characters are copied into the 1685 * subarray of {@code dst} starting at index {@code dstBegin} 1686 * and ending at index: 1687 * <blockquote><pre> 1688 * dstBegin + (srcEnd-srcBegin) - 1 1689 * </pre></blockquote> 1690 * 1691 * @param srcBegin index of the first character in the string 1692 * to copy. 1693 * @param srcEnd index after the last character in the string 1694 * to copy. 1695 * @param dst the destination array. 1696 * @param dstBegin the start offset in the destination array. 1697 * @throws IndexOutOfBoundsException If any of the following 1698 * is true: 1699 * <ul><li>{@code srcBegin} is negative. 1700 * <li>{@code srcBegin} is greater than {@code srcEnd} 1701 * <li>{@code srcEnd} is greater than the length of this 1702 * string 1703 * <li>{@code dstBegin} is negative 1704 * <li>{@code dstBegin+(srcEnd-srcBegin)} is larger than 1705 * {@code dst.length}</ul> 1706 */ 1707 public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin) { 1708 checkBoundsBeginEnd(srcBegin, srcEnd, length()); 1709 checkBoundsOffCount(dstBegin, srcEnd - srcBegin, dst.length); 1710 if (isLatin1()) { 1711 StringLatin1.getChars(value, srcBegin, srcEnd, dst, dstBegin); 1712 } else { 1713 StringUTF16.getChars(value, srcBegin, srcEnd, dst, dstBegin); 1714 } 1715 } 1716 1717 /** 1718 * Copies characters from this string into the destination byte array. Each 1719 * byte receives the 8 low-order bits of the corresponding character. The 1720 * eight high-order bits of each character are not copied and do not 1721 * participate in the transfer in any way. 1722 * 1723 * <p> The first character to be copied is at index {@code srcBegin}; the 1724 * last character to be copied is at index {@code srcEnd-1}. 
The total 1725 * number of characters to be copied is {@code srcEnd-srcBegin}. The 1726 * characters, converted to bytes, are copied into the subarray of {@code 1727 * dst} starting at index {@code dstBegin} and ending at index: 1728 * 1729 * <blockquote><pre> 1730 * dstBegin + (srcEnd-srcBegin) - 1 1731 * </pre></blockquote> 1732 * 1733 * @deprecated This method does not properly convert characters into 1734 * bytes. As of JDK 1.1, the preferred way to do this is via the 1735 * {@link #getBytes()} method, which uses the {@link Charset#defaultCharset() 1736 * default charset}. 1737 * 1738 * @param srcBegin 1739 * Index of the first character in the string to copy 1740 * 1741 * @param srcEnd 1742 * Index after the last character in the string to copy 1743 * 1744 * @param dst 1745 * The destination array 1746 * 1747 * @param dstBegin 1748 * The start offset in the destination array 1749 * 1750 * @throws IndexOutOfBoundsException 1751 * If any of the following is true: 1752 * <ul> 1753 * <li> {@code srcBegin} is negative 1754 * <li> {@code srcBegin} is greater than {@code srcEnd} 1755 * <li> {@code srcEnd} is greater than the length of this String 1756 * <li> {@code dstBegin} is negative 1757 * <li> {@code dstBegin+(srcEnd-srcBegin)} is larger than {@code 1758 * dst.length} 1759 * </ul> 1760 */ 1761 @Deprecated(since="1.1") 1762 public void getBytes(int srcBegin, int srcEnd, byte[] dst, int dstBegin) { 1763 checkBoundsBeginEnd(srcBegin, srcEnd, length()); 1764 Objects.requireNonNull(dst); 1765 checkBoundsOffCount(dstBegin, srcEnd - srcBegin, dst.length); 1766 if (isLatin1()) { 1767 StringLatin1.getBytes(value, srcBegin, srcEnd, dst, dstBegin); 1768 } else { 1769 StringUTF16.getBytes(value, srcBegin, srcEnd, dst, dstBegin); 1770 } 1771 } 1772 1773 /** 1774 * Encodes this {@code String} into a sequence of bytes using the named 1775 * charset, storing the result into a new byte array. 
1776 * 1777 * <p> The behavior of this method when this string cannot be encoded in 1778 * the given charset is unspecified. The {@link 1779 * java.nio.charset.CharsetEncoder} class should be used when more control 1780 * over the encoding process is required. 1781 * 1782 * @param charsetName 1783 * The name of a supported {@linkplain java.nio.charset.Charset 1784 * charset} 1785 * 1786 * @return The resultant byte array 1787 * 1788 * @throws UnsupportedEncodingException 1789 * If the named charset is not supported 1790 * 1791 * @since 1.1 1792 */ 1793 public byte[] getBytes(String charsetName) 1794 throws UnsupportedEncodingException { 1795 return encode(lookupCharset(charsetName), coder(), value); 1796 } 1797 1798 /** 1799 * Encodes this {@code String} into a sequence of bytes using the given 1800 * {@linkplain java.nio.charset.Charset charset}, storing the result into a 1801 * new byte array. 1802 * 1803 * <p> This method always replaces malformed-input and unmappable-character 1804 * sequences with this charset's default replacement byte array. The 1805 * {@link java.nio.charset.CharsetEncoder} class should be used when more 1806 * control over the encoding process is required. 1807 * 1808 * @param charset 1809 * The {@linkplain java.nio.charset.Charset} to be used to encode 1810 * the {@code String} 1811 * 1812 * @return The resultant byte array 1813 * 1814 * @since 1.6 1815 */ 1816 public byte[] getBytes(Charset charset) { 1817 if (charset == null) throw new NullPointerException(); 1818 return encode(charset, coder(), value); 1819 } 1820 1821 /** 1822 * Encodes this {@code String} into a sequence of bytes using the 1823 * {@link Charset#defaultCharset() default charset}, storing the result 1824 * into a new byte array. 1825 * 1826 * <p> The behavior of this method when this string cannot be encoded in 1827 * the default charset is unspecified. 
The {@link 1828 * java.nio.charset.CharsetEncoder} class should be used when more control 1829 * over the encoding process is required. 1830 * 1831 * @return The resultant byte array 1832 * 1833 * @since 1.1 1834 */ 1835 public byte[] getBytes() { 1836 return encode(Charset.defaultCharset(), coder(), value); 1837 } 1838 1839 /** 1840 * Compares this string to the specified object. The result is {@code 1841 * true} if and only if the argument is not {@code null} and is a {@code 1842 * String} object that represents the same sequence of characters as this 1843 * object. 1844 * 1845 * <p>For finer-grained String comparison, refer to 1846 * {@link java.text.Collator}. 1847 * 1848 * @param anObject 1849 * The object to compare this {@code String} against 1850 * 1851 * @return {@code true} if the given object represents a {@code String} 1852 * equivalent to this string, {@code false} otherwise 1853 * 1854 * @see #compareTo(String) 1855 * @see #equalsIgnoreCase(String) 1856 */ 1857 public boolean equals(Object anObject) { 1858 if (this == anObject) { 1859 return true; 1860 } 1861 return (anObject instanceof String aString) 1862 && (!COMPACT_STRINGS || this.coder == aString.coder) 1863 && StringLatin1.equals(value, aString.value); 1864 } 1865 1866 /** 1867 * Compares this string to the specified {@code StringBuffer}. The result 1868 * is {@code true} if and only if this {@code String} represents the same 1869 * sequence of characters as the specified {@code StringBuffer}. This method 1870 * synchronizes on the {@code StringBuffer}. 1871 * 1872 * <p>For finer-grained String comparison, refer to 1873 * {@link java.text.Collator}. 
1874 * 1875 * @param sb 1876 * The {@code StringBuffer} to compare this {@code String} against 1877 * 1878 * @return {@code true} if this {@code String} represents the same 1879 * sequence of characters as the specified {@code StringBuffer}, 1880 * {@code false} otherwise 1881 * 1882 * @since 1.4 1883 */ 1884 public boolean contentEquals(StringBuffer sb) { 1885 return contentEquals((CharSequence)sb); 1886 } 1887 1888 private boolean nonSyncContentEquals(AbstractStringBuilder sb) { 1889 int len = length(); 1890 if (len != sb.length()) { 1891 return false; 1892 } 1893 byte[] v1 = value; 1894 byte[] v2 = sb.getValue(); 1895 byte coder = coder(); 1896 if (coder == sb.getCoder()) { 1897 return v1.length <= v2.length && ArraysSupport.mismatch(v1, v2, v1.length) < 0; 1898 } else { 1899 if (coder != LATIN1) { // utf16 str and latin1 abs can never be "equal" 1900 return false; 1901 } 1902 return StringUTF16.contentEquals(v1, v2, len); 1903 } 1904 } 1905 1906 /** 1907 * Compares this string to the specified {@code CharSequence}. The 1908 * result is {@code true} if and only if this {@code String} represents the 1909 * same sequence of char values as the specified sequence. Note that if the 1910 * {@code CharSequence} is a {@code StringBuffer} then the method 1911 * synchronizes on it. 1912 * 1913 * <p>For finer-grained String comparison, refer to 1914 * {@link java.text.Collator}. 
1915 * 1916 * @param cs 1917 * The sequence to compare this {@code String} against 1918 * 1919 * @return {@code true} if this {@code String} represents the same 1920 * sequence of char values as the specified sequence, {@code 1921 * false} otherwise 1922 * 1923 * @since 1.5 1924 */ 1925 public boolean contentEquals(CharSequence cs) { 1926 // Argument is a StringBuffer, StringBuilder 1927 if (cs instanceof AbstractStringBuilder) { 1928 if (cs instanceof StringBuffer) { 1929 synchronized(cs) { 1930 return nonSyncContentEquals((AbstractStringBuilder)cs); 1931 } 1932 } else { 1933 return nonSyncContentEquals((AbstractStringBuilder)cs); 1934 } 1935 } 1936 // Argument is a String 1937 if (cs instanceof String) { 1938 return equals(cs); 1939 } 1940 // Argument is a generic CharSequence 1941 int n = cs.length(); 1942 if (n != length()) { 1943 return false; 1944 } 1945 byte[] val = this.value; 1946 if (isLatin1()) { 1947 for (int i = 0; i < n; i++) { 1948 if ((val[i] & 0xff) != cs.charAt(i)) { 1949 return false; 1950 } 1951 } 1952 } else { 1953 if (!StringUTF16.contentEquals(val, cs, n)) { 1954 return false; 1955 } 1956 } 1957 return true; 1958 } 1959 1960 /** 1961 * Compares this {@code String} to another {@code String}, ignoring case 1962 * considerations. Two strings are considered equal ignoring case if they 1963 * are of the same length and corresponding Unicode code points in the two 1964 * strings are equal ignoring case. 1965 * 1966 * <p> Two Unicode code points are considered the same 1967 * ignoring case if at least one of the following is true: 1968 * <ul> 1969 * <li> The two Unicode code points are the same (as compared by the 1970 * {@code ==} operator) 1971 * <li> Calling {@code Character.toLowerCase(Character.toUpperCase(int))} 1972 * on each Unicode code point produces the same result 1973 * </ul> 1974 * 1975 * <p>Note that this method does <em>not</em> take locale into account, and 1976 * will result in unsatisfactory results for certain locales. 
The 1977 * {@link java.text.Collator} class provides locale-sensitive comparison. 1978 * 1979 * @param anotherString 1980 * The {@code String} to compare this {@code String} against 1981 * 1982 * @return {@code true} if the argument is not {@code null} and it 1983 * represents an equivalent {@code String} ignoring case; {@code 1984 * false} otherwise 1985 * 1986 * @see #equals(Object) 1987 * @see #codePoints() 1988 */ 1989 public boolean equalsIgnoreCase(String anotherString) { 1990 return (this == anotherString) ? true 1991 : (anotherString != null) 1992 && (anotherString.length() == length()) 1993 && regionMatches(true, 0, anotherString, 0, length()); 1994 } 1995 1996 /** 1997 * Compares two strings lexicographically. 1998 * The comparison is based on the Unicode value of each character in 1999 * the strings. The character sequence represented by this 2000 * {@code String} object is compared lexicographically to the 2001 * character sequence represented by the argument string. The result is 2002 * a negative integer if this {@code String} object 2003 * lexicographically precedes the argument string. The result is a 2004 * positive integer if this {@code String} object lexicographically 2005 * follows the argument string. The result is zero if the strings 2006 * are equal; {@code compareTo} returns {@code 0} exactly when 2007 * the {@link #equals(Object)} method would return {@code true}. 2008 * <p> 2009 * This is the definition of lexicographic ordering. If two strings are 2010 * different, then either they have different characters at some index 2011 * that is a valid index for both strings, or their lengths are different, 2012 * or both. If they have different characters at one or more index 2013 * positions, let <i>k</i> be the smallest such index; then the string 2014 * whose character at position <i>k</i> has the smaller value, as 2015 * determined by using the {@code <} operator, lexicographically precedes the 2016 * other string. 
In this case, {@code compareTo} returns the 2017 * difference of the two character values at position {@code k} in 2018 * the two string -- that is, the value: 2019 * <blockquote><pre> 2020 * this.charAt(k)-anotherString.charAt(k) 2021 * </pre></blockquote> 2022 * If there is no index position at which they differ, then the shorter 2023 * string lexicographically precedes the longer string. In this case, 2024 * {@code compareTo} returns the difference of the lengths of the 2025 * strings -- that is, the value: 2026 * <blockquote><pre> 2027 * this.length()-anotherString.length() 2028 * </pre></blockquote> 2029 * 2030 * <p>For finer-grained String comparison, refer to 2031 * {@link java.text.Collator}. 2032 * 2033 * @param anotherString the {@code String} to be compared. 2034 * @return the value {@code 0} if the argument string is equal to 2035 * this string; a value less than {@code 0} if this string 2036 * is lexicographically less than the string argument; and a 2037 * value greater than {@code 0} if this string is 2038 * lexicographically greater than the string argument. 2039 */ 2040 public int compareTo(String anotherString) { 2041 byte[] v1 = value; 2042 byte[] v2 = anotherString.value; 2043 byte coder = coder(); 2044 if (coder == anotherString.coder()) { 2045 return coder == LATIN1 ? StringLatin1.compareTo(v1, v2) 2046 : StringUTF16.compareTo(v1, v2); 2047 } 2048 return coder == LATIN1 ? StringLatin1.compareToUTF16(v1, v2) 2049 : StringUTF16.compareToLatin1(v1, v2); 2050 } 2051 2052 /** 2053 * A Comparator that orders {@code String} objects as by 2054 * {@link #compareToIgnoreCase(String) compareToIgnoreCase}. 2055 * This comparator is serializable. 2056 * <p> 2057 * Note that this Comparator does <em>not</em> take locale into account, 2058 * and will result in an unsatisfactory ordering for certain locales. 2059 * The {@link java.text.Collator} class provides locale-sensitive comparison. 
2060 * 2061 * @see java.text.Collator 2062 * @since 1.2 2063 */ 2064 public static final Comparator<String> CASE_INSENSITIVE_ORDER 2065 = new CaseInsensitiveComparator(); 2066 2067 /** 2068 * CaseInsensitiveComparator for Strings. 2069 */ 2070 private static class CaseInsensitiveComparator 2071 implements Comparator<String>, java.io.Serializable { 2072 // use serialVersionUID from JDK 1.2.2 for interoperability 2073 @java.io.Serial 2074 private static final long serialVersionUID = 8575799808933029326L; 2075 2076 public int compare(String s1, String s2) { 2077 byte[] v1 = s1.value; 2078 byte[] v2 = s2.value; 2079 byte coder = s1.coder(); 2080 if (coder == s2.coder()) { 2081 return coder == LATIN1 ? StringLatin1.compareToCI(v1, v2) 2082 : StringUTF16.compareToCI(v1, v2); 2083 } 2084 return coder == LATIN1 ? StringLatin1.compareToCI_UTF16(v1, v2) 2085 : StringUTF16.compareToCI_Latin1(v1, v2); 2086 } 2087 2088 /** Replaces the de-serialized object. */ 2089 @java.io.Serial 2090 private Object readResolve() { return CASE_INSENSITIVE_ORDER; } 2091 } 2092 2093 /** 2094 * Compares two strings lexicographically, ignoring case 2095 * differences. This method returns an integer whose sign is that of 2096 * calling {@code compareTo} with case folded versions of the strings 2097 * where case differences have been eliminated by calling 2098 * {@code Character.toLowerCase(Character.toUpperCase(int))} on 2099 * each Unicode code point. 2100 * <p> 2101 * Note that this method does <em>not</em> take locale into account, 2102 * and will result in an unsatisfactory ordering for certain locales. 2103 * The {@link java.text.Collator} class provides locale-sensitive comparison. 2104 * 2105 * @param str the {@code String} to be compared. 2106 * @return a negative integer, zero, or a positive integer as the 2107 * specified String is greater than, equal to, or less 2108 * than this String, ignoring case considerations. 
2109 * @see java.text.Collator 2110 * @see #codePoints() 2111 * @since 1.2 2112 */ 2113 public int compareToIgnoreCase(String str) { 2114 return CASE_INSENSITIVE_ORDER.compare(this, str); 2115 } 2116 2117 /** 2118 * Tests if two string regions are equal. 2119 * <p> 2120 * A substring of this {@code String} object is compared to a substring 2121 * of the argument other. The result is true if these substrings 2122 * represent identical character sequences. The substring of this 2123 * {@code String} object to be compared begins at index {@code toffset} 2124 * and has length {@code len}. The substring of other to be compared 2125 * begins at index {@code ooffset} and has length {@code len}. The 2126 * result is {@code false} if and only if at least one of the following 2127 * is true: 2128 * <ul><li>{@code toffset} is negative. 2129 * <li>{@code ooffset} is negative. 2130 * <li>{@code toffset+len} is greater than the length of this 2131 * {@code String} object. 2132 * <li>{@code ooffset+len} is greater than the length of the other 2133 * argument. 2134 * <li>There is some nonnegative integer <i>k</i> less than {@code len} 2135 * such that: 2136 * {@code this.charAt(toffset + }<i>k</i>{@code ) != other.charAt(ooffset + } 2137 * <i>k</i>{@code )} 2138 * </ul> 2139 * 2140 * <p>Note that this method does <em>not</em> take locale into account. The 2141 * {@link java.text.Collator} class provides locale-sensitive comparison. 2142 * 2143 * @param toffset the starting offset of the subregion in this string. 2144 * @param other the string argument. 2145 * @param ooffset the starting offset of the subregion in the string 2146 * argument. 2147 * @param len the number of characters to compare. 2148 * @return {@code true} if the specified subregion of this string 2149 * exactly matches the specified subregion of the string argument; 2150 * {@code false} otherwise. 
2151 */ 2152 public boolean regionMatches(int toffset, String other, int ooffset, int len) { 2153 // Note: toffset, ooffset, or len might be near -1>>>1. 2154 if ((ooffset < 0) || (toffset < 0) || 2155 (toffset > (long)length() - len) || 2156 (ooffset > (long)other.length() - len)) { 2157 return false; 2158 } 2159 byte[] tv = value; 2160 byte[] ov = other.value; 2161 byte coder = coder(); 2162 if (coder == other.coder()) { 2163 if (coder == UTF16) { 2164 toffset <<= UTF16; 2165 ooffset <<= UTF16; 2166 len <<= UTF16; 2167 } 2168 return ArraysSupport.mismatch(tv, toffset, 2169 ov, ooffset, len) < 0; 2170 } else { 2171 if (coder == LATIN1) { 2172 while (len-- > 0) { 2173 if (StringLatin1.getChar(tv, toffset++) != 2174 StringUTF16.getChar(ov, ooffset++)) { 2175 return false; 2176 } 2177 } 2178 } else { 2179 while (len-- > 0) { 2180 if (StringUTF16.getChar(tv, toffset++) != 2181 StringLatin1.getChar(ov, ooffset++)) { 2182 return false; 2183 } 2184 } 2185 } 2186 } 2187 return true; 2188 } 2189 2190 /** 2191 * Tests if two string regions are equal. 2192 * <p> 2193 * A substring of this {@code String} object is compared to a substring 2194 * of the argument {@code other}. The result is {@code true} if these 2195 * substrings represent Unicode code point sequences that are the same, 2196 * ignoring case if and only if {@code ignoreCase} is true. 2197 * The sequences {@code tsequence} and {@code osequence} are compared, 2198 * where {@code tsequence} is the sequence produced as if by calling 2199 * {@code this.substring(toffset, toffset + len).codePoints()} and 2200 * {@code osequence} is the sequence produced as if by calling 2201 * {@code other.substring(ooffset, ooffset + len).codePoints()}. 2202 * The result is {@code true} if and only if all of the following 2203 * are true: 2204 * <ul><li>{@code toffset} is non-negative. 2205 * <li>{@code ooffset} is non-negative. 2206 * <li>{@code toffset+len} is less than or equal to the length of this 2207 * {@code String} object. 
2208 * <li>{@code ooffset+len} is less than or equal to the length of the other 2209 * argument. 2210 * <li>if {@code ignoreCase} is {@code false}, all pairs of corresponding Unicode 2211 * code points are equal integer values; or if {@code ignoreCase} is {@code true}, 2212 * {@link Character#toLowerCase(int) Character.toLowerCase(} 2213 * {@link Character#toUpperCase(int)}{@code )} on all pairs of Unicode code points 2214 * results in equal integer values. 2215 * </ul> 2216 * 2217 * <p>Note that this method does <em>not</em> take locale into account, 2218 * and will result in unsatisfactory results for certain locales when 2219 * {@code ignoreCase} is {@code true}. The {@link java.text.Collator} class 2220 * provides locale-sensitive comparison. 2221 * 2222 * @param ignoreCase if {@code true}, ignore case when comparing 2223 * characters. 2224 * @param toffset the starting offset of the subregion in this 2225 * string. 2226 * @param other the string argument. 2227 * @param ooffset the starting offset of the subregion in the string 2228 * argument. 2229 * @param len the number of characters (Unicode code units - 2230 * 16bit {@code char} value) to compare. 2231 * @return {@code true} if the specified subregion of this string 2232 * matches the specified subregion of the string argument; 2233 * {@code false} otherwise. Whether the matching is exact 2234 * or case insensitive depends on the {@code ignoreCase} 2235 * argument. 2236 * @see #codePoints() 2237 */ 2238 public boolean regionMatches(boolean ignoreCase, int toffset, 2239 String other, int ooffset, int len) { 2240 if (!ignoreCase) { 2241 return regionMatches(toffset, other, ooffset, len); 2242 } 2243 // Note: toffset, ooffset, or len might be near -1>>>1. 
2244 if ((ooffset < 0) || (toffset < 0) 2245 || (toffset > (long)length() - len) 2246 || (ooffset > (long)other.length() - len)) { 2247 return false; 2248 } 2249 byte[] tv = value; 2250 byte[] ov = other.value; 2251 byte coder = coder(); 2252 if (coder == other.coder()) { 2253 return coder == LATIN1 2254 ? StringLatin1.regionMatchesCI(tv, toffset, ov, ooffset, len) 2255 : StringUTF16.regionMatchesCI(tv, toffset, ov, ooffset, len); 2256 } 2257 return coder == LATIN1 2258 ? StringLatin1.regionMatchesCI_UTF16(tv, toffset, ov, ooffset, len) 2259 : StringUTF16.regionMatchesCI_Latin1(tv, toffset, ov, ooffset, len); 2260 } 2261 2262 /** 2263 * Tests if the substring of this string beginning at the 2264 * specified index starts with the specified prefix. 2265 * 2266 * @param prefix the prefix. 2267 * @param toffset where to begin looking in this string. 2268 * @return {@code true} if the character sequence represented by the 2269 * argument is a prefix of the substring of this object starting 2270 * at index {@code toffset}; {@code false} otherwise. 2271 * The result is {@code false} if {@code toffset} is 2272 * negative or greater than the length of this 2273 * {@code String} object; otherwise the result is the same 2274 * as the result of the expression 2275 * <pre> 2276 * this.substring(toffset).startsWith(prefix) 2277 * </pre> 2278 */ 2279 public boolean startsWith(String prefix, int toffset) { 2280 // Note: toffset might be near -1>>>1. 
2281 if (toffset < 0 || toffset > length() - prefix.length()) { 2282 return false; 2283 } 2284 byte[] ta = value; 2285 byte[] pa = prefix.value; 2286 int po = 0; 2287 int pc = pa.length; 2288 byte coder = coder(); 2289 if (coder == prefix.coder()) { 2290 if (coder == UTF16) { 2291 toffset <<= UTF16; 2292 } 2293 return ArraysSupport.mismatch(ta, toffset, 2294 pa, 0, pc) < 0; 2295 } else { 2296 if (coder == LATIN1) { // && pcoder == UTF16 2297 return false; 2298 } 2299 // coder == UTF16 && pcoder == LATIN1) 2300 while (po < pc) { 2301 if (StringUTF16.getChar(ta, toffset++) != (pa[po++] & 0xff)) { 2302 return false; 2303 } 2304 } 2305 } 2306 return true; 2307 } 2308 2309 /** 2310 * Tests if this string starts with the specified prefix. 2311 * 2312 * @param prefix the prefix. 2313 * @return {@code true} if the character sequence represented by the 2314 * argument is a prefix of the character sequence represented by 2315 * this string; {@code false} otherwise. 2316 * Note also that {@code true} will be returned if the 2317 * argument is an empty string or is equal to this 2318 * {@code String} object as determined by the 2319 * {@link #equals(Object)} method. 2320 * @since 1.0 2321 */ 2322 public boolean startsWith(String prefix) { 2323 return startsWith(prefix, 0); 2324 } 2325 2326 /** 2327 * Tests if this string ends with the specified suffix. 2328 * 2329 * @param suffix the suffix. 2330 * @return {@code true} if the character sequence represented by the 2331 * argument is a suffix of the character sequence represented by 2332 * this object; {@code false} otherwise. Note that the 2333 * result will be {@code true} if the argument is the 2334 * empty string or is equal to this {@code String} object 2335 * as determined by the {@link #equals(Object)} method. 2336 */ 2337 public boolean endsWith(String suffix) { 2338 return startsWith(suffix, length() - suffix.length()); 2339 } 2340 2341 /** 2342 * Returns a hash code for this string. 
The hash code for a 2343 * {@code String} object is computed as 2344 * <blockquote><pre> 2345 * s[0]*31^(n-1) + s[1]*31^(n-2) + ... + s[n-1] 2346 * </pre></blockquote> 2347 * using {@code int} arithmetic, where {@code s[i]} is the 2348 * <i>i</i>th character of the string, {@code n} is the length of 2349 * the string, and {@code ^} indicates exponentiation. 2350 * (The hash value of the empty string is zero.) 2351 * 2352 * @return a hash code value for this object. 2353 */ 2354 public int hashCode() { 2355 // The hash or hashIsZero fields are subject to a benign data race, 2356 // making it crucial to ensure that any observable result of the 2357 // calculation in this method stays correct under any possible read of 2358 // these fields. Necessary restrictions to allow this to be correct 2359 // without explicit memory fences or similar concurrency primitives is 2360 // that we can ever only write to one of these two fields for a given 2361 // String instance, and that the computation is idempotent and derived 2362 // from immutable state 2363 int h = hash; 2364 if (h == 0 && !hashIsZero) { 2365 h = isLatin1() ? StringLatin1.hashCode(value) 2366 : StringUTF16.hashCode(value); 2367 if (h == 0) { 2368 hashIsZero = true; 2369 } else { 2370 hash = h; 2371 } 2372 } 2373 return h; 2374 } 2375 2376 /** 2377 * Returns the index within this string of the first occurrence of 2378 * the specified character. If a character with value 2379 * {@code ch} occurs in the character sequence represented by 2380 * this {@code String} object, then the index (in Unicode 2381 * code units) of the first such occurrence is returned. For 2382 * values of {@code ch} in the range from 0 to 0xFFFF 2383 * (inclusive), this is the smallest value <i>k</i> such that: 2384 * <blockquote><pre> 2385 * this.charAt(<i>k</i>) == ch 2386 * </pre></blockquote> 2387 * is true. 
For other values of {@code ch}, it is the 2388 * smallest value <i>k</i> such that: 2389 * <blockquote><pre> 2390 * this.codePointAt(<i>k</i>) == ch 2391 * </pre></blockquote> 2392 * is true. In either case, if no such character occurs in this 2393 * string, then {@code -1} is returned. 2394 * 2395 * @param ch a character (Unicode code point). 2396 * @return the index of the first occurrence of the character in the 2397 * character sequence represented by this object, or 2398 * {@code -1} if the character does not occur. 2399 */ 2400 public int indexOf(int ch) { 2401 return indexOf(ch, 0); 2402 } 2403 2404 /** 2405 * Returns the index within this string of the first occurrence of the 2406 * specified character, starting the search at the specified index. 2407 * <p> 2408 * If a character with value {@code ch} occurs in the 2409 * character sequence represented by this {@code String} 2410 * object at an index no smaller than {@code fromIndex}, then 2411 * the index of the first such occurrence is returned. For values 2412 * of {@code ch} in the range from 0 to 0xFFFF (inclusive), 2413 * this is the smallest value <i>k</i> such that: 2414 * <blockquote><pre> 2415 * (this.charAt(<i>k</i>) == ch) {@code &&} (<i>k</i> >= fromIndex) 2416 * </pre></blockquote> 2417 * is true. For other values of {@code ch}, it is the 2418 * smallest value <i>k</i> such that: 2419 * <blockquote><pre> 2420 * (this.codePointAt(<i>k</i>) == ch) {@code &&} (<i>k</i> >= fromIndex) 2421 * </pre></blockquote> 2422 * is true. In either case, if no such character occurs in this 2423 * string at or after position {@code fromIndex}, then 2424 * {@code -1} is returned. 2425 * 2426 * <p> 2427 * There is no restriction on the value of {@code fromIndex}. If it 2428 * is negative, it has the same effect as if it were zero: this entire 2429 * string may be searched. 
If it is greater than the length of this 2430 * string, it has the same effect as if it were equal to the length of 2431 * this string: {@code -1} is returned. 2432 * 2433 * <p>All indices are specified in {@code char} values 2434 * (Unicode code units). 2435 * 2436 * @param ch a character (Unicode code point). 2437 * @param fromIndex the index to start the search from. 2438 * @return the index of the first occurrence of the character in the 2439 * character sequence represented by this object that is greater 2440 * than or equal to {@code fromIndex}, or {@code -1} 2441 * if the character does not occur. 2442 * 2443 * @apiNote 2444 * Unlike {@link #substring(int)}, for example, this method does not throw 2445 * an exception when {@code fromIndex} is outside the valid range. 2446 * Rather, it returns -1 when {@code fromIndex} is larger than the length of 2447 * the string. 2448 * This result is, by itself, indistinguishable from a genuine absence of 2449 * {@code ch} in the string. 2450 * If stricter behavior is needed, {@link #indexOf(int, int, int)} 2451 * should be considered instead. 2452 * On a {@link String} {@code s}, for example, 2453 * {@code s.indexOf(ch, fromIndex, s.length())} would throw if 2454 * {@code fromIndex} were larger than the string length, or were negative. 2455 */ 2456 public int indexOf(int ch, int fromIndex) { 2457 return isLatin1() ? StringLatin1.indexOf(value, ch, fromIndex, length()) 2458 : StringUTF16.indexOf(value, ch, fromIndex, length()); 2459 } 2460 2461 /** 2462 * Returns the index within this string of the first occurrence of the 2463 * specified character, starting the search at {@code beginIndex} and 2464 * stopping before {@code endIndex}. 
2465 * 2466 * <p>If a character with value {@code ch} occurs in the 2467 * character sequence represented by this {@code String} 2468 * object at an index no smaller than {@code beginIndex} but smaller than 2469 * {@code endIndex}, then 2470 * the index of the first such occurrence is returned. For values 2471 * of {@code ch} in the range from 0 to 0xFFFF (inclusive), 2472 * this is the smallest value <i>k</i> such that: 2473 * <blockquote><pre> 2474 * (this.charAt(<i>k</i>) == ch) && (beginIndex <= <i>k</i> < endIndex) 2475 * </pre></blockquote> 2476 * is true. For other values of {@code ch}, it is the 2477 * smallest value <i>k</i> such that: 2478 * <blockquote><pre> 2479 * (this.codePointAt(<i>k</i>) == ch) && (beginIndex <= <i>k</i> < endIndex) 2480 * </pre></blockquote> 2481 * is true. In either case, if no such character occurs in this 2482 * string at or after position {@code beginIndex} and before position 2483 * {@code endIndex}, then {@code -1} is returned. 2484 * 2485 * <p>All indices are specified in {@code char} values 2486 * (Unicode code units). 2487 * 2488 * @param ch a character (Unicode code point). 2489 * @param beginIndex the index to start the search from (included). 2490 * @param endIndex the index to stop the search at (excluded). 2491 * @return the index of the first occurrence of the character in the 2492 * character sequence represented by this object that is greater 2493 * than or equal to {@code beginIndex} and less than {@code endIndex}, 2494 * or {@code -1} if the character does not occur. 2495 * @throws StringIndexOutOfBoundsException if {@code beginIndex} 2496 * is negative, or {@code endIndex} is larger than the length of 2497 * this {@code String} object, or {@code beginIndex} is larger than 2498 * {@code endIndex}. 2499 * @since 21 2500 */ 2501 public int indexOf(int ch, int beginIndex, int endIndex) { 2502 checkBoundsBeginEnd(beginIndex, endIndex, length()); 2503 return isLatin1() ? 
StringLatin1.indexOf(value, ch, beginIndex, endIndex) 2504 : StringUTF16.indexOf(value, ch, beginIndex, endIndex); 2505 } 2506 2507 /** 2508 * Returns the index within this string of the last occurrence of 2509 * the specified character. For values of {@code ch} in the 2510 * range from 0 to 0xFFFF (inclusive), the index (in Unicode code 2511 * units) returned is the largest value <i>k</i> such that: 2512 * <blockquote><pre> 2513 * this.charAt(<i>k</i>) == ch 2514 * </pre></blockquote> 2515 * is true. For other values of {@code ch}, it is the 2516 * largest value <i>k</i> such that: 2517 * <blockquote><pre> 2518 * this.codePointAt(<i>k</i>) == ch 2519 * </pre></blockquote> 2520 * is true. In either case, if no such character occurs in this 2521 * string, then {@code -1} is returned. The 2522 * {@code String} is searched backwards starting at the last 2523 * character. 2524 * 2525 * @param ch a character (Unicode code point). 2526 * @return the index of the last occurrence of the character in the 2527 * character sequence represented by this object, or 2528 * {@code -1} if the character does not occur. 2529 */ 2530 public int lastIndexOf(int ch) { 2531 return lastIndexOf(ch, length() - 1); 2532 } 2533 2534 /** 2535 * Returns the index within this string of the last occurrence of 2536 * the specified character, searching backward starting at the 2537 * specified index. For values of {@code ch} in the range 2538 * from 0 to 0xFFFF (inclusive), the index returned is the largest 2539 * value <i>k</i> such that: 2540 * <blockquote><pre> 2541 * (this.charAt(<i>k</i>) == ch) {@code &&} (<i>k</i> <= fromIndex) 2542 * </pre></blockquote> 2543 * is true. For other values of {@code ch}, it is the 2544 * largest value <i>k</i> such that: 2545 * <blockquote><pre> 2546 * (this.codePointAt(<i>k</i>) == ch) {@code &&} (<i>k</i> <= fromIndex) 2547 * </pre></blockquote> 2548 * is true. 
In either case, if no such character occurs in this 2549 * string at or before position {@code fromIndex}, then 2550 * {@code -1} is returned. 2551 * 2552 * <p>All indices are specified in {@code char} values 2553 * (Unicode code units). 2554 * 2555 * @param ch a character (Unicode code point). 2556 * @param fromIndex the index to start the search from. There is no 2557 * restriction on the value of {@code fromIndex}. If it is 2558 * greater than or equal to the length of this string, it has 2559 * the same effect as if it were equal to one less than the 2560 * length of this string: this entire string may be searched. 2561 * If it is negative, it has the same effect as if it were -1: 2562 * -1 is returned. 2563 * @return the index of the last occurrence of the character in the 2564 * character sequence represented by this object that is less 2565 * than or equal to {@code fromIndex}, or {@code -1} 2566 * if the character does not occur before that point. 2567 */ 2568 public int lastIndexOf(int ch, int fromIndex) { 2569 return isLatin1() ? StringLatin1.lastIndexOf(value, ch, fromIndex) 2570 : StringUTF16.lastIndexOf(value, ch, fromIndex); 2571 } 2572 2573 /** 2574 * Returns the index within this string of the first occurrence of the 2575 * specified substring. 2576 * 2577 * <p>The returned index is the smallest value {@code k} for which: 2578 * <pre>{@code 2579 * this.startsWith(str, k) 2580 * }</pre> 2581 * If no such value of {@code k} exists, then {@code -1} is returned. 2582 * 2583 * @param str the substring to search for. 2584 * @return the index of the first occurrence of the specified substring, 2585 * or {@code -1} if there is no such occurrence. 2586 */ 2587 public int indexOf(String str) { 2588 byte coder = coder(); 2589 if (coder == str.coder()) { 2590 return isLatin1() ? 
StringLatin1.indexOf(value, str.value) 2591 : StringUTF16.indexOf(value, str.value); 2592 } 2593 if (coder == LATIN1) { // str.coder == UTF16 2594 return -1; 2595 } 2596 return StringUTF16.indexOfLatin1(value, str.value); 2597 } 2598 2599 /** 2600 * Returns the index within this string of the first occurrence of the 2601 * specified substring, starting at the specified index. 2602 * 2603 * <p>The returned index is the smallest value {@code k} for which: 2604 * <pre>{@code 2605 * k >= Math.min(fromIndex, this.length()) && 2606 * this.startsWith(str, k) 2607 * }</pre> 2608 * If no such value of {@code k} exists, then {@code -1} is returned. 2609 * 2610 * @apiNote 2611 * Unlike {@link #substring(int)}, for example, this method does not throw 2612 * an exception when {@code fromIndex} is outside the valid range. 2613 * Rather, it returns -1 when {@code fromIndex} is larger than the length of 2614 * the string. 2615 * This result is, by itself, indistinguishable from a genuine absence of 2616 * {@code str} in the string. 2617 * If stricter behavior is needed, {@link #indexOf(String, int, int)} 2618 * should be considered instead. 2619 * On {@link String} {@code s} and a non-empty {@code str}, for example, 2620 * {@code s.indexOf(str, fromIndex, s.length())} would throw if 2621 * {@code fromIndex} were larger than the string length, or were negative. 2622 * 2623 * @param str the substring to search for. 2624 * @param fromIndex the index from which to start the search. 2625 * @return the index of the first occurrence of the specified substring, 2626 * starting at the specified index, 2627 * or {@code -1} if there is no such occurrence. 2628 */ 2629 public int indexOf(String str, int fromIndex) { 2630 return indexOf(value, coder(), length(), str, fromIndex); 2631 } 2632 2633 /** 2634 * Returns the index of the first occurrence of the specified substring 2635 * within the specified index range of {@code this} string. 
2636 * 2637 * <p>This method returns the same result as the one of the invocation 2638 * <pre>{@code 2639 * s.substring(beginIndex, endIndex).indexOf(str) + beginIndex 2640 * }</pre> 2641 * if the index returned by {@link #indexOf(String)} is non-negative, 2642 * and returns -1 otherwise. 2643 * (No substring is instantiated, though.) 2644 * 2645 * @param str the substring to search for. 2646 * @param beginIndex the index to start the search from (included). 2647 * @param endIndex the index to stop the search at (excluded). 2648 * @return the index of the first occurrence of the specified substring 2649 * within the specified index range, 2650 * or {@code -1} if there is no such occurrence. 2651 * @throws StringIndexOutOfBoundsException if {@code beginIndex} 2652 * is negative, or {@code endIndex} is larger than the length of 2653 * this {@code String} object, or {@code beginIndex} is larger than 2654 * {@code endIndex}. 2655 * @since 21 2656 */ 2657 public int indexOf(String str, int beginIndex, int endIndex) { 2658 if (str.length() == 1) { 2659 /* Simple optimization, can be omitted without behavioral impact */ 2660 return indexOf(str.charAt(0), beginIndex, endIndex); 2661 } 2662 checkBoundsBeginEnd(beginIndex, endIndex, length()); 2663 return indexOf(value, coder(), endIndex, str, beginIndex); 2664 } 2665 2666 /** 2667 * Code shared by String and AbstractStringBuilder to do searches. The 2668 * source is the character array being searched, and the target 2669 * is the string being searched for. 2670 * 2671 * @param src the characters being searched. 2672 * @param srcCoder the coder of the source string. 2673 * @param srcCount last index (exclusive) in the source string. 2674 * @param tgtStr the characters being searched for. 2675 * @param fromIndex the index to begin searching from. 
2676 */ 2677 static int indexOf(byte[] src, byte srcCoder, int srcCount, 2678 String tgtStr, int fromIndex) { 2679 fromIndex = Math.clamp(fromIndex, 0, srcCount); 2680 int tgtCount = tgtStr.length(); 2681 if (tgtCount > srcCount - fromIndex) { 2682 return -1; 2683 } 2684 if (tgtCount == 0) { 2685 return fromIndex; 2686 } 2687 2688 byte[] tgt = tgtStr.value; 2689 byte tgtCoder = tgtStr.coder(); 2690 if (srcCoder == tgtCoder) { 2691 return srcCoder == LATIN1 2692 ? StringLatin1.indexOf(src, srcCount, tgt, tgtCount, fromIndex) 2693 : StringUTF16.indexOf(src, srcCount, tgt, tgtCount, fromIndex); 2694 } 2695 if (srcCoder == LATIN1) { // && tgtCoder == UTF16 2696 return -1; 2697 } 2698 // srcCoder == UTF16 && tgtCoder == LATIN1) { 2699 return StringUTF16.indexOfLatin1(src, srcCount, tgt, tgtCount, fromIndex); 2700 } 2701 2702 /** 2703 * Returns the index within this string of the last occurrence of the 2704 * specified substring. The last occurrence of the empty string "" 2705 * is considered to occur at the index value {@code this.length()}. 2706 * 2707 * <p>The returned index is the largest value {@code k} for which: 2708 * <pre>{@code 2709 * this.startsWith(str, k) 2710 * }</pre> 2711 * If no such value of {@code k} exists, then {@code -1} is returned. 2712 * 2713 * @param str the substring to search for. 2714 * @return the index of the last occurrence of the specified substring, 2715 * or {@code -1} if there is no such occurrence. 2716 */ 2717 public int lastIndexOf(String str) { 2718 return lastIndexOf(str, length()); 2719 } 2720 2721 /** 2722 * Returns the index within this string of the last occurrence of the 2723 * specified substring, searching backward starting at the specified index. 2724 * 2725 * <p>The returned index is the largest value {@code k} for which: 2726 * <pre>{@code 2727 * k <= Math.min(fromIndex, this.length()) && 2728 * this.startsWith(str, k) 2729 * }</pre> 2730 * If no such value of {@code k} exists, then {@code -1} is returned. 
2731 * 2732 * @param str the substring to search for. 2733 * @param fromIndex the index to start the search from. 2734 * @return the index of the last occurrence of the specified substring, 2735 * searching backward from the specified index, 2736 * or {@code -1} if there is no such occurrence. 2737 */ 2738 public int lastIndexOf(String str, int fromIndex) { 2739 return lastIndexOf(value, coder(), length(), str, fromIndex); 2740 } 2741 2742 /** 2743 * Code shared by String and AbstractStringBuilder to do searches. The 2744 * source is the character array being searched, and the target 2745 * is the string being searched for. 2746 * 2747 * @param src the characters being searched. 2748 * @param srcCoder coder handles the mapping between bytes/chars 2749 * @param srcCount count of the source string. 2750 * @param tgtStr the characters being searched for. 2751 * @param fromIndex the index to begin searching from. 2752 */ 2753 static int lastIndexOf(byte[] src, byte srcCoder, int srcCount, 2754 String tgtStr, int fromIndex) { 2755 byte[] tgt = tgtStr.value; 2756 byte tgtCoder = tgtStr.coder(); 2757 int tgtCount = tgtStr.length(); 2758 /* 2759 * Check arguments; return immediately where possible. For 2760 * consistency, don't check for null str. 2761 */ 2762 int rightIndex = srcCount - tgtCount; 2763 if (fromIndex > rightIndex) { 2764 fromIndex = rightIndex; 2765 } 2766 if (fromIndex < 0) { 2767 return -1; 2768 } 2769 /* Empty string always matches. */ 2770 if (tgtCount == 0) { 2771 return fromIndex; 2772 } 2773 if (srcCoder == tgtCoder) { 2774 return srcCoder == LATIN1 2775 ? 
StringLatin1.lastIndexOf(src, srcCount, tgt, tgtCount, fromIndex) 2776 : StringUTF16.lastIndexOf(src, srcCount, tgt, tgtCount, fromIndex); 2777 } 2778 if (srcCoder == LATIN1) { // && tgtCoder == UTF16 2779 return -1; 2780 } 2781 // srcCoder == UTF16 && tgtCoder == LATIN1 2782 return StringUTF16.lastIndexOfLatin1(src, srcCount, tgt, tgtCount, fromIndex); 2783 } 2784 2785 /** 2786 * Returns a string that is a substring of this string. The 2787 * substring begins with the character at the specified index and 2788 * extends to the end of this string. <p> 2789 * Examples: 2790 * <blockquote><pre> 2791 * "unhappy".substring(2) returns "happy" 2792 * "Harbison".substring(3) returns "bison" 2793 * "emptiness".substring(9) returns "" (an empty string) 2794 * </pre></blockquote> 2795 * 2796 * @param beginIndex the beginning index, inclusive. 2797 * @return the specified substring. 2798 * @throws IndexOutOfBoundsException if 2799 * {@code beginIndex} is negative or larger than the 2800 * length of this {@code String} object. 2801 */ 2802 public String substring(int beginIndex) { 2803 return substring(beginIndex, length()); 2804 } 2805 2806 /** 2807 * Returns a string that is a substring of this string. The 2808 * substring begins at the specified {@code beginIndex} and 2809 * extends to the character at index {@code endIndex - 1}. 2810 * Thus the length of the substring is {@code endIndex-beginIndex}. 2811 * <p> 2812 * Examples: 2813 * <blockquote><pre> 2814 * "hamburger".substring(4, 8) returns "urge" 2815 * "smiles".substring(1, 5) returns "mile" 2816 * </pre></blockquote> 2817 * 2818 * @param beginIndex the beginning index, inclusive. 2819 * @param endIndex the ending index, exclusive. 2820 * @return the specified substring. 2821 * @throws IndexOutOfBoundsException if the 2822 * {@code beginIndex} is negative, or 2823 * {@code endIndex} is larger than the length of 2824 * this {@code String} object, or 2825 * {@code beginIndex} is larger than 2826 * {@code endIndex}. 
2827 */ 2828 public String substring(int beginIndex, int endIndex) { 2829 int length = length(); 2830 checkBoundsBeginEnd(beginIndex, endIndex, length); 2831 if (beginIndex == 0 && endIndex == length) { 2832 return this; 2833 } 2834 int subLen = endIndex - beginIndex; 2835 return isLatin1() ? StringLatin1.newString(value, beginIndex, subLen) 2836 : StringUTF16.newString(value, beginIndex, subLen); 2837 } 2838 2839 /** 2840 * Returns a character sequence that is a subsequence of this sequence. 2841 * 2842 * <p> An invocation of this method of the form 2843 * 2844 * <blockquote><pre> 2845 * str.subSequence(begin, end)</pre></blockquote> 2846 * 2847 * behaves in exactly the same way as the invocation 2848 * 2849 * <blockquote><pre> 2850 * str.substring(begin, end)</pre></blockquote> 2851 * 2852 * @apiNote 2853 * This method is defined so that the {@code String} class can implement 2854 * the {@link CharSequence} interface. 2855 * 2856 * @param beginIndex the begin index, inclusive. 2857 * @param endIndex the end index, exclusive. 2858 * @return the specified subsequence. 2859 * 2860 * @throws IndexOutOfBoundsException 2861 * if {@code beginIndex} or {@code endIndex} is negative, 2862 * if {@code endIndex} is greater than {@code length()}, 2863 * or if {@code beginIndex} is greater than {@code endIndex} 2864 * 2865 * @since 1.4 2866 */ 2867 public CharSequence subSequence(int beginIndex, int endIndex) { 2868 return this.substring(beginIndex, endIndex); 2869 } 2870 2871 /** 2872 * Concatenates the specified string to the end of this string. 2873 * <p> 2874 * If the length of the argument string is {@code 0}, then this 2875 * {@code String} object is returned. 
Otherwise, a 2876 * {@code String} object is returned that represents a character 2877 * sequence that is the concatenation of the character sequence 2878 * represented by this {@code String} object and the character 2879 * sequence represented by the argument string.<p> 2880 * Examples: 2881 * <blockquote><pre> 2882 * "cares".concat("s") returns "caress" 2883 * "to".concat("get").concat("her") returns "together" 2884 * </pre></blockquote> 2885 * 2886 * @param str the {@code String} that is concatenated to the end 2887 * of this {@code String}. 2888 * @return a string that represents the concatenation of this object's 2889 * characters followed by the string argument's characters. 2890 */ 2891 public String concat(String str) { 2892 if (str.isEmpty()) { 2893 return this; 2894 } 2895 return StringConcatHelper.simpleConcat(this, str); 2896 } 2897 2898 /** 2899 * Returns a string resulting from replacing all occurrences of 2900 * {@code oldChar} in this string with {@code newChar}. 2901 * <p> 2902 * If the character {@code oldChar} does not occur in the 2903 * character sequence represented by this {@code String} object, 2904 * then a reference to this {@code String} object is returned. 2905 * Otherwise, a {@code String} object is returned that 2906 * represents a character sequence identical to the character sequence 2907 * represented by this {@code String} object, except that every 2908 * occurrence of {@code oldChar} is replaced by an occurrence 2909 * of {@code newChar}. 2910 * <p> 2911 * Examples: 2912 * <blockquote><pre> 2913 * "mesquite in your cellar".replace('e', 'o') 2914 * returns "mosquito in your collar" 2915 * "the war of baronets".replace('r', 'y') 2916 * returns "the way of bayonets" 2917 * "sparring with a purple porpoise".replace('p', 't') 2918 * returns "starring with a turtle tortoise" 2919 * "JonL".replace('q', 'x') returns "JonL" (no change) 2920 * </pre></blockquote> 2921 * 2922 * @param oldChar the old character. 
2923 * @param newChar the new character. 2924 * @return a string derived from this string by replacing every 2925 * occurrence of {@code oldChar} with {@code newChar}. 2926 */ 2927 public String replace(char oldChar, char newChar) { 2928 if (oldChar != newChar) { 2929 String ret = isLatin1() ? StringLatin1.replace(value, oldChar, newChar) 2930 : StringUTF16.replace(value, oldChar, newChar); 2931 if (ret != null) { 2932 return ret; 2933 } 2934 } 2935 return this; 2936 } 2937 2938 /** 2939 * Tells whether or not this string matches the given <a 2940 * href="../util/regex/Pattern.html#sum">regular expression</a>. 2941 * 2942 * <p> An invocation of this method of the form 2943 * <i>str</i>{@code .matches(}<i>regex</i>{@code )} yields exactly the 2944 * same result as the expression 2945 * 2946 * <blockquote> 2947 * {@link java.util.regex.Pattern}.{@link java.util.regex.Pattern#matches(String,CharSequence) 2948 * matches(<i>regex</i>, <i>str</i>)} 2949 * </blockquote> 2950 * 2951 * @param regex 2952 * the regular expression to which this string is to be matched 2953 * 2954 * @return {@code true} if, and only if, this string matches the 2955 * given regular expression 2956 * 2957 * @throws PatternSyntaxException 2958 * if the regular expression's syntax is invalid 2959 * 2960 * @see java.util.regex.Pattern 2961 * 2962 * @since 1.4 2963 */ 2964 public boolean matches(String regex) { 2965 return Pattern.matches(regex, this); 2966 } 2967 2968 /** 2969 * Returns true if and only if this string contains the specified 2970 * sequence of char values. 2971 * 2972 * @param s the sequence to search for 2973 * @return true if this string contains {@code s}, false otherwise 2974 * @since 1.5 2975 */ 2976 public boolean contains(CharSequence s) { 2977 return indexOf(s.toString()) >= 0; 2978 } 2979 2980 /** 2981 * Replaces the first substring of this string that matches the given <a 2982 * href="../util/regex/Pattern.html#sum">regular expression</a> with the 2983 * given replacement. 
2984 * 2985 * <p> An invocation of this method of the form 2986 * <i>str</i>{@code .replaceFirst(}<i>regex</i>{@code ,} <i>repl</i>{@code )} 2987 * yields exactly the same result as the expression 2988 * 2989 * <blockquote> 2990 * <code> 2991 * {@link java.util.regex.Pattern}.{@link 2992 * java.util.regex.Pattern#compile(String) compile}(<i>regex</i>).{@link 2993 * java.util.regex.Pattern#matcher(java.lang.CharSequence) matcher}(<i>str</i>).{@link 2994 * java.util.regex.Matcher#replaceFirst(String) replaceFirst}(<i>repl</i>) 2995 * </code> 2996 * </blockquote> 2997 * 2998 *<p> 2999 * Note that backslashes ({@code \}) and dollar signs ({@code $}) in the 3000 * replacement string may cause the results to be different than if it were 3001 * being treated as a literal replacement string; see 3002 * {@link java.util.regex.Matcher#replaceFirst}. 3003 * Use {@link java.util.regex.Matcher#quoteReplacement} to suppress the special 3004 * meaning of these characters, if desired. 3005 * 3006 * @param regex 3007 * the regular expression to which this string is to be matched 3008 * @param replacement 3009 * the string to be substituted for the first match 3010 * 3011 * @return The resulting {@code String} 3012 * 3013 * @throws PatternSyntaxException 3014 * if the regular expression's syntax is invalid 3015 * 3016 * @see java.util.regex.Pattern 3017 * 3018 * @since 1.4 3019 */ 3020 public String replaceFirst(String regex, String replacement) { 3021 return Pattern.compile(regex).matcher(this).replaceFirst(replacement); 3022 } 3023 3024 /** 3025 * Replaces each substring of this string that matches the given <a 3026 * href="../util/regex/Pattern.html#sum">regular expression</a> with the 3027 * given replacement. 
3028 * 3029 * <p> An invocation of this method of the form 3030 * <i>str</i>{@code .replaceAll(}<i>regex</i>{@code ,} <i>repl</i>{@code )} 3031 * yields exactly the same result as the expression 3032 * 3033 * <blockquote> 3034 * <code> 3035 * {@link java.util.regex.Pattern}.{@link 3036 * java.util.regex.Pattern#compile(String) compile}(<i>regex</i>).{@link 3037 * java.util.regex.Pattern#matcher(java.lang.CharSequence) matcher}(<i>str</i>).{@link 3038 * java.util.regex.Matcher#replaceAll(String) replaceAll}(<i>repl</i>) 3039 * </code> 3040 * </blockquote> 3041 * 3042 *<p> 3043 * Note that backslashes ({@code \}) and dollar signs ({@code $}) in the 3044 * replacement string may cause the results to be different than if it were 3045 * being treated as a literal replacement string; see 3046 * {@link java.util.regex.Matcher#replaceAll Matcher.replaceAll}. 3047 * Use {@link java.util.regex.Matcher#quoteReplacement} to suppress the special 3048 * meaning of these characters, if desired. 3049 * 3050 * @param regex 3051 * the regular expression to which this string is to be matched 3052 * @param replacement 3053 * the string to be substituted for each match 3054 * 3055 * @return The resulting {@code String} 3056 * 3057 * @throws PatternSyntaxException 3058 * if the regular expression's syntax is invalid 3059 * 3060 * @see java.util.regex.Pattern 3061 * 3062 * @since 1.4 3063 */ 3064 public String replaceAll(String regex, String replacement) { 3065 return Pattern.compile(regex).matcher(this).replaceAll(replacement); 3066 } 3067 3068 /** 3069 * Replaces each substring of this string that matches the literal target 3070 * sequence with the specified literal replacement sequence. The 3071 * replacement proceeds from the beginning of the string to the end, for 3072 * example, replacing "aa" with "b" in the string "aaa" will result in 3073 * "ba" rather than "ab". 
3074 * 3075 * @param target The sequence of char values to be replaced 3076 * @param replacement The replacement sequence of char values 3077 * @return The resulting string 3078 * @since 1.5 3079 */ 3080 public String replace(CharSequence target, CharSequence replacement) { 3081 String trgtStr = target.toString(); 3082 String replStr = replacement.toString(); 3083 int thisLen = length(); 3084 int trgtLen = trgtStr.length(); 3085 int replLen = replStr.length(); 3086 3087 if (trgtLen > 0) { 3088 if (trgtLen == 1 && replLen == 1) { 3089 return replace(trgtStr.charAt(0), replStr.charAt(0)); 3090 } 3091 3092 boolean thisIsLatin1 = this.isLatin1(); 3093 boolean trgtIsLatin1 = trgtStr.isLatin1(); 3094 boolean replIsLatin1 = replStr.isLatin1(); 3095 String ret = (thisIsLatin1 && trgtIsLatin1 && replIsLatin1) 3096 ? StringLatin1.replace(value, thisLen, 3097 trgtStr.value, trgtLen, 3098 replStr.value, replLen) 3099 : StringUTF16.replace(value, thisLen, thisIsLatin1, 3100 trgtStr.value, trgtLen, trgtIsLatin1, 3101 replStr.value, replLen, replIsLatin1); 3102 if (ret != null) { 3103 return ret; 3104 } 3105 return this; 3106 3107 } else { // trgtLen == 0 3108 int resultLen; 3109 try { 3110 resultLen = Math.addExact(thisLen, Math.multiplyExact( 3111 Math.addExact(thisLen, 1), replLen)); 3112 } catch (ArithmeticException ignored) { 3113 throw new OutOfMemoryError("Required length exceeds implementation limit"); 3114 } 3115 3116 StringBuilder sb = new StringBuilder(resultLen); 3117 sb.append(replStr); 3118 for (int i = 0; i < thisLen; ++i) { 3119 sb.append(charAt(i)).append(replStr); 3120 } 3121 return sb.toString(); 3122 } 3123 } 3124 3125 /** 3126 * Splits this string around matches of the given 3127 * <a href="../util/regex/Pattern.html#sum">regular expression</a>. 
3128 * 3129 * <p> The array returned by this method contains each substring of this 3130 * string that is terminated by another substring that matches the given 3131 * expression or is terminated by the end of the string. The substrings in 3132 * the array are in the order in which they occur in this string. If the 3133 * expression does not match any part of the input then the resulting array 3134 * has just one element, namely this string. 3135 * 3136 * <p> When there is a positive-width match at the beginning of this 3137 * string then an empty leading substring is included at the beginning 3138 * of the resulting array. A zero-width match at the beginning however 3139 * never produces such empty leading substring. 3140 * 3141 * <p> The {@code limit} parameter controls the number of times the 3142 * pattern is applied and therefore affects the length of the resulting 3143 * array. 3144 * <ul> 3145 * <li><p> 3146 * If the <i>limit</i> is positive then the pattern will be applied 3147 * at most <i>limit</i> - 1 times, the array's length will be 3148 * no greater than <i>limit</i>, and the array's last entry will contain 3149 * all input beyond the last matched delimiter.</p></li> 3150 * 3151 * <li><p> 3152 * If the <i>limit</i> is zero then the pattern will be applied as 3153 * many times as possible, the array can have any length, and trailing 3154 * empty strings will be discarded.</p></li> 3155 * 3156 * <li><p> 3157 * If the <i>limit</i> is negative then the pattern will be applied 3158 * as many times as possible and the array can have any length.</p></li> 3159 * </ul> 3160 * 3161 * <p> The string {@code "boo:and:foo"}, for example, yields the 3162 * following results with these parameters: 3163 * 3164 * <blockquote><table class="plain"> 3165 * <caption style="display:none">Split example showing regex, limit, and result</caption> 3166 * <thead> 3167 * <tr> 3168 * <th scope="col">Regex</th> 3169 * <th scope="col">Limit</th> 3170 * <th scope="col">Result</th> 
3171 * </tr> 3172 * </thead> 3173 * <tbody> 3174 * <tr><th scope="row" rowspan="3" style="font-weight:normal">:</th> 3175 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">2</th> 3176 * <td>{@code { "boo", "and:foo" }}</td></tr> 3177 * <tr><!-- : --> 3178 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 3179 * <td>{@code { "boo", "and", "foo" }}</td></tr> 3180 * <tr><!-- : --> 3181 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-2</th> 3182 * <td>{@code { "boo", "and", "foo" }}</td></tr> 3183 * <tr><th scope="row" rowspan="3" style="font-weight:normal">o</th> 3184 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 3185 * <td>{@code { "b", "", ":and:f", "", "" }}</td></tr> 3186 * <tr><!-- o --> 3187 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-2</th> 3188 * <td>{@code { "b", "", ":and:f", "", "" }}</td></tr> 3189 * <tr><!-- o --> 3190 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">0</th> 3191 * <td>{@code { "b", "", ":and:f" }}</td></tr> 3192 * </tbody> 3193 * </table></blockquote> 3194 * 3195 * <p> An invocation of this method of the form 3196 * <i>str.</i>{@code split(}<i>regex</i>{@code ,} <i>n</i>{@code )} 3197 * yields the same result as the expression 3198 * 3199 * <blockquote> 3200 * <code> 3201 * {@link java.util.regex.Pattern}.{@link 3202 * java.util.regex.Pattern#compile(String) compile}(<i>regex</i>).{@link 3203 * java.util.regex.Pattern#split(java.lang.CharSequence,int) split}(<i>str</i>, <i>n</i>) 3204 * </code> 3205 * </blockquote> 3206 * 3207 * 3208 * @param regex 3209 * the delimiting regular expression 3210 * 3211 * @param limit 3212 * the result threshold, as described above 3213 * 3214 * @return the array of strings computed by splitting this string 3215 * around matches of the given regular expression 3216 * 3217 * @throws 
PatternSyntaxException 3218 * if the regular expression's syntax is invalid 3219 * 3220 * @see java.util.regex.Pattern 3221 * 3222 * @since 1.4 3223 */ 3224 public String[] split(String regex, int limit) { 3225 return split(regex, limit, false); 3226 } 3227 3228 /** 3229 * Splits this string around matches of the given regular expression and 3230 * returns both the strings and the matching delimiters. 3231 * 3232 * <p> The array returned by this method contains each substring of this 3233 * string that is terminated by another substring that matches the given 3234 * expression or is terminated by the end of the string. 3235 * Each substring is immediately followed by the subsequence (the delimiter) 3236 * that matches the given expression, <em>except</em> for the last 3237 * substring, which is not followed by anything. 3238 * The substrings in the array and the delimiters are in the order in which 3239 * they occur in the input. 3240 * If the expression does not match any part of the input then the resulting 3241 * array has just one element, namely this string. 3242 * 3243 * <p> When there is a positive-width match at the beginning of this 3244 * string then an empty leading substring is included at the beginning 3245 * of the resulting array. A zero-width match at the beginning however 3246 * never produces such empty leading substring nor the empty delimiter. 3247 * 3248 * <p> The {@code limit} parameter controls the number of times the 3249 * pattern is applied and therefore affects the length of the resulting 3250 * array. 
3251 * <ul> 3252 * <li> If the <i>limit</i> is positive then the pattern will be applied 3253 * at most <i>limit</i> - 1 times, the array's length will be 3254 * no greater than 2 × <i>limit</i> - 1, and the array's last 3255 * entry will contain all input beyond the last matched delimiter.</li> 3256 * 3257 * <li> If the <i>limit</i> is zero then the pattern will be applied as 3258 * many times as possible, the array can have any length, and trailing 3259 * empty strings will be discarded.</li> 3260 * 3261 * <li> If the <i>limit</i> is negative then the pattern will be applied 3262 * as many times as possible and the array can have any length.</li> 3263 * </ul> 3264 * 3265 * <p> The input {@code "boo:::and::foo"}, for example, yields the following 3266 * results with these parameters: 3267 * 3268 * <table class="plain" style="margin-left:2em;"> 3269 * <caption style="display:none">Split example showing regex, limit, and result</caption> 3270 * <thead> 3271 * <tr> 3272 * <th scope="col">Regex</th> 3273 * <th scope="col">Limit</th> 3274 * <th scope="col">Result</th> 3275 * </tr> 3276 * </thead> 3277 * <tbody> 3278 * <tr><th scope="row" rowspan="3" style="font-weight:normal">:+</th> 3279 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">2</th> 3280 * <td>{@code { "boo", ":::", "and::foo" }}</td></tr> 3281 * <tr><!-- : --> 3282 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 3283 * <td>{@code { "boo", ":::", "and", "::", "foo" }}</td></tr> 3284 * <tr><!-- : --> 3285 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-1</th> 3286 * <td>{@code { "boo", ":::", "and", "::", "foo" }}</td></tr> 3287 * <tr><th scope="row" rowspan="3" style="font-weight:normal">o</th> 3288 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 3289 * <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o", "" }}</td></tr> 3290 * <tr><!-- o --> 3291 * <th 
scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-1</th> 3292 * <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o", "" }}</td></tr> 3293 * <tr><!-- o --> 3294 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">0</th> 3295 * <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o" }}</td></tr> 3296 * </tbody> 3297 * </table> 3298 * 3299 * @apiNote An invocation of this method of the form 3300 * <i>str.</i>{@code splitWithDelimiters(}<i>regex</i>{@code ,} <i>n</i>{@code )} 3301 * yields the same result as the expression 3302 * 3303 * <blockquote> 3304 * <code> 3305 * {@link java.util.regex.Pattern}.{@link 3306 * java.util.regex.Pattern#compile(String) compile}(<i>regex</i>).{@link 3307 * java.util.regex.Pattern#splitWithDelimiters(CharSequence,int) splitWithDelimiters}(<i>str</i>, <i>n</i>) 3308 * </code> 3309 * </blockquote> 3310 * 3311 * @param regex 3312 * the delimiting regular expression 3313 * 3314 * @param limit 3315 * the result threshold, as described above 3316 * 3317 * @return the array of strings computed by splitting this string 3318 * around matches of the given regular expression, alternating 3319 * substrings and matching delimiters 3320 * 3321 * @since 21 3322 */ 3323 public String[] splitWithDelimiters(String regex, int limit) { 3324 return split(regex, limit, true); 3325 } 3326 3327 private String[] split(String regex, int limit, boolean withDelimiters) { 3328 /* fastpath if the regex is a 3329 * (1) one-char String and this character is not one of the 3330 * RegEx's meta characters ".$|()[{^?*+\\", or 3331 * (2) two-char String and the first char is the backslash and 3332 * the second is not the ascii digit or ascii letter. 
3333 */ 3334 char ch = 0; 3335 if (((regex.length() == 1 && 3336 ".$|()[{^?*+\\".indexOf(ch = regex.charAt(0)) == -1) || 3337 (regex.length() == 2 && 3338 regex.charAt(0) == '\\' && 3339 (((ch = regex.charAt(1))-'0')|('9'-ch)) < 0 && 3340 ((ch-'a')|('z'-ch)) < 0 && 3341 ((ch-'A')|('Z'-ch)) < 0)) && 3342 (ch < Character.MIN_HIGH_SURROGATE || 3343 ch > Character.MAX_LOW_SURROGATE)) 3344 { 3345 // All the checks above can potentially be constant folded by 3346 // a JIT/AOT compiler when the regex is a constant string. 3347 // That requires method inlining of the checks, which is only 3348 // possible when the actual split logic is in a separate method 3349 // because the large split loop can usually not be inlined. 3350 return split(ch, limit, withDelimiters); 3351 } 3352 Pattern pattern = Pattern.compile(regex); 3353 return withDelimiters 3354 ? pattern.splitWithDelimiters(this, limit) 3355 : pattern.split(this, limit); 3356 } 3357 3358 private String[] split(char ch, int limit, boolean withDelimiters) { 3359 int matchCount = 0; 3360 int off = 0; 3361 int next; 3362 boolean limited = limit > 0; 3363 ArrayList<String> list = new ArrayList<>(); 3364 String del = withDelimiters ? 
String.valueOf(ch) : null; 3365 while ((next = indexOf(ch, off)) != -1) { 3366 if (!limited || matchCount < limit - 1) { 3367 list.add(substring(off, next)); 3368 if (withDelimiters) { 3369 list.add(del); 3370 } 3371 off = next + 1; 3372 ++matchCount; 3373 } else { // last one 3374 int last = length(); 3375 list.add(substring(off, last)); 3376 off = last; 3377 ++matchCount; 3378 break; 3379 } 3380 } 3381 // If no match was found, return this 3382 if (off == 0) 3383 return new String[] {this}; 3384 3385 // Add remaining segment 3386 if (!limited || matchCount < limit) 3387 list.add(substring(off, length())); 3388 3389 // Construct result 3390 int resultSize = list.size(); 3391 if (limit == 0) { 3392 while (resultSize > 0 && list.get(resultSize - 1).isEmpty()) { 3393 resultSize--; 3394 } 3395 } 3396 String[] result = new String[resultSize]; 3397 return list.subList(0, resultSize).toArray(result); 3398 } 3399 3400 /** 3401 * Splits this string around matches of the given <a 3402 * href="../util/regex/Pattern.html#sum">regular expression</a>. 3403 * 3404 * <p> This method works as if by invoking the two-argument {@link 3405 * #split(String, int) split} method with the given expression and a limit 3406 * argument of zero. Trailing empty strings are therefore not included in 3407 * the resulting array. 
3408 * 3409 * <p> The string {@code "boo:and:foo"}, for example, yields the following 3410 * results with these expressions: 3411 * 3412 * <blockquote><table class="plain"> 3413 * <caption style="display:none">Split examples showing regex and result</caption> 3414 * <thead> 3415 * <tr> 3416 * <th scope="col">Regex</th> 3417 * <th scope="col">Result</th> 3418 * </tr> 3419 * </thead> 3420 * <tbody> 3421 * <tr><th scope="row" style="text-weight:normal">:</th> 3422 * <td>{@code { "boo", "and", "foo" }}</td></tr> 3423 * <tr><th scope="row" style="text-weight:normal">o</th> 3424 * <td>{@code { "b", "", ":and:f" }}</td></tr> 3425 * </tbody> 3426 * </table></blockquote> 3427 * 3428 * 3429 * @param regex 3430 * the delimiting regular expression 3431 * 3432 * @return the array of strings computed by splitting this string 3433 * around matches of the given regular expression 3434 * 3435 * @throws PatternSyntaxException 3436 * if the regular expression's syntax is invalid 3437 * 3438 * @see java.util.regex.Pattern 3439 * 3440 * @since 1.4 3441 */ 3442 public String[] split(String regex) { 3443 return split(regex, 0, false); 3444 } 3445 3446 /** 3447 * Returns a new String composed of copies of the 3448 * {@code CharSequence elements} joined together with a copy of 3449 * the specified {@code delimiter}. 3450 * 3451 * <blockquote>For example, 3452 * <pre>{@code 3453 * String message = String.join("-", "Java", "is", "cool"); 3454 * // message returned is: "Java-is-cool" 3455 * }</pre></blockquote> 3456 * 3457 * Note that if an element is null, then {@code "null"} is added. 3458 * 3459 * @param delimiter the delimiter that separates each element 3460 * @param elements the elements to join together. 
3461 * 3462 * @return a new {@code String} that is composed of the {@code elements} 3463 * separated by the {@code delimiter} 3464 * 3465 * @throws NullPointerException If {@code delimiter} or {@code elements} 3466 * is {@code null} 3467 * 3468 * @see java.util.StringJoiner 3469 * @since 1.8 3470 */ 3471 public static String join(CharSequence delimiter, CharSequence... elements) { 3472 var delim = delimiter.toString(); 3473 var elems = new String[elements.length]; 3474 for (int i = 0; i < elements.length; i++) { 3475 elems[i] = String.valueOf(elements[i]); 3476 } 3477 return join("", "", delim, elems, elems.length); 3478 } 3479 3480 /** 3481 * Designated join routine. 3482 * 3483 * @param prefix the non-null prefix 3484 * @param suffix the non-null suffix 3485 * @param delimiter the non-null delimiter 3486 * @param elements the non-null array of non-null elements 3487 * @param size the number of elements in the array (<= elements.length) 3488 * @return the joined string 3489 */ 3490 @ForceInline 3491 static String join(String prefix, String suffix, String delimiter, String[] elements, int size) { 3492 int icoder = prefix.coder() | suffix.coder(); 3493 long len = (long) prefix.length() + suffix.length(); 3494 if (size > 1) { // when there are more than one element, size - 1 delimiters will be emitted 3495 len += (long) (size - 1) * delimiter.length(); 3496 icoder |= delimiter.coder(); 3497 } 3498 // assert len > 0L; // max: (long) Integer.MAX_VALUE << 32 3499 // following loop will add max: (long) Integer.MAX_VALUE * Integer.MAX_VALUE to len 3500 // so len can overflow at most once 3501 for (int i = 0; i < size; i++) { 3502 var el = elements[i]; 3503 len += el.length(); 3504 icoder |= el.coder(); 3505 } 3506 byte coder = (byte) icoder; 3507 // long len overflow check, char -> byte length, int len overflow check 3508 if (len < 0L || (len <<= coder) != (int) len) { 3509 throw new OutOfMemoryError("Requested string length exceeds VM limit"); 3510 } 3511 byte[] value = 
StringConcatHelper.newArray(len); 3512 3513 int off = 0; 3514 prefix.getBytes(value, off, coder); off += prefix.length(); 3515 if (size > 0) { 3516 var el = elements[0]; 3517 el.getBytes(value, off, coder); off += el.length(); 3518 for (int i = 1; i < size; i++) { 3519 delimiter.getBytes(value, off, coder); off += delimiter.length(); 3520 el = elements[i]; 3521 el.getBytes(value, off, coder); off += el.length(); 3522 } 3523 } 3524 suffix.getBytes(value, off, coder); 3525 // assert off + suffix.length() == value.length >> coder; 3526 3527 return new String(value, coder); 3528 } 3529 3530 /** 3531 * Returns a new {@code String} composed of copies of the 3532 * {@code CharSequence elements} joined together with a copy of the 3533 * specified {@code delimiter}. 3534 * 3535 * <blockquote>For example, 3536 * <pre>{@code 3537 * List<String> strings = List.of("Java", "is", "cool"); 3538 * String message = String.join(" ", strings); 3539 * // message returned is: "Java is cool" 3540 * 3541 * Set<String> strings = 3542 * new LinkedHashSet<>(List.of("Java", "is", "very", "cool")); 3543 * String message = String.join("-", strings); 3544 * // message returned is: "Java-is-very-cool" 3545 * }</pre></blockquote> 3546 * 3547 * Note that if an individual element is {@code null}, then {@code "null"} is added. 3548 * 3549 * @param delimiter a sequence of characters that is used to separate each 3550 * of the {@code elements} in the resulting {@code String} 3551 * @param elements an {@code Iterable} that will have its {@code elements} 3552 * joined together. 3553 * 3554 * @return a new {@code String} that is composed from the {@code elements} 3555 * argument 3556 * 3557 * @throws NullPointerException If {@code delimiter} or {@code elements} 3558 * is {@code null} 3559 * 3560 * @see #join(CharSequence,CharSequence...) 3561 * @see java.util.StringJoiner 3562 * @since 1.8 3563 */ 3564 public static String join(CharSequence delimiter, 3565 Iterable<? 
extends CharSequence> elements) { 3566 Objects.requireNonNull(delimiter); 3567 Objects.requireNonNull(elements); 3568 var delim = delimiter.toString(); 3569 var elems = new String[8]; 3570 int size = 0; 3571 for (CharSequence cs: elements) { 3572 if (size >= elems.length) { 3573 elems = Arrays.copyOf(elems, elems.length << 1); 3574 } 3575 elems[size++] = String.valueOf(cs); 3576 } 3577 return join("", "", delim, elems, size); 3578 } 3579 3580 /** 3581 * Converts all of the characters in this {@code String} to lower 3582 * case using the rules of the given {@code Locale}. Case mapping is based 3583 * on the Unicode Standard version specified by the {@link java.lang.Character Character} 3584 * class. Since case mappings are not always 1:1 char mappings, the resulting {@code String} 3585 * and this {@code String} may differ in length. 3586 * <p> 3587 * Examples of lowercase mappings are in the following table: 3588 * <table class="plain"> 3589 * <caption style="display:none">Lowercase mapping examples showing language code of locale, upper case, lower case, and description</caption> 3590 * <thead> 3591 * <tr> 3592 * <th scope="col">Language Code of Locale</th> 3593 * <th scope="col">Upper Case</th> 3594 * <th scope="col">Lower Case</th> 3595 * <th scope="col">Description</th> 3596 * </tr> 3597 * </thead> 3598 * <tbody> 3599 * <tr> 3600 * <td>tr (Turkish)</td> 3601 * <th scope="row" style="font-weight:normal; text-align:left">\u0130</th> 3602 * <td>\u0069</td> 3603 * <td>capital letter I with dot above -> small letter i</td> 3604 * </tr> 3605 * <tr> 3606 * <td>tr (Turkish)</td> 3607 * <th scope="row" style="font-weight:normal; text-align:left">\u0049</th> 3608 * <td>\u0131</td> 3609 * <td>capital letter I -> small letter dotless i </td> 3610 * </tr> 3611 * <tr> 3612 * <td>(all)</td> 3613 * <th scope="row" style="font-weight:normal; text-align:left">French Fries</th> 3614 * <td>french fries</td> 3615 * <td>lowercased all chars in String</td> 3616 * </tr> 3617 * <tr> 
3618 * <td>(all)</td> 3619 * <th scope="row" style="font-weight:normal; text-align:left"> 3620 * ΙΧΘΥΣ</th> 3621 * <td>ιχθυσ</td> 3622 * <td>lowercased all chars in String</td> 3623 * </tr> 3624 * </tbody> 3625 * </table> 3626 * 3627 * @param locale use the case transformation rules for this locale 3628 * @return the {@code String}, converted to lowercase. 3629 * @see java.lang.String#toLowerCase() 3630 * @see java.lang.String#toUpperCase() 3631 * @see java.lang.String#toUpperCase(Locale) 3632 * @since 1.1 3633 */ 3634 public String toLowerCase(Locale locale) { 3635 return isLatin1() ? StringLatin1.toLowerCase(this, value, locale) 3636 : StringUTF16.toLowerCase(this, value, locale); 3637 } 3638 3639 /** 3640 * Converts all of the characters in this {@code String} to lower 3641 * case using the rules of the default locale. This method is equivalent to 3642 * {@code toLowerCase(Locale.getDefault())}. 3643 * 3644 * @apiNote This method is locale sensitive, and may produce unexpected 3645 * results if used for strings that are intended to be interpreted locale 3646 * independently. 3647 * Examples are programming language identifiers, protocol keys, and HTML 3648 * tags. 3649 * For instance, {@code "TITLE".toLowerCase()} in a Turkish locale 3650 * returns {@code "t\u005Cu0131tle"}, where '\u005Cu0131' is the 3651 * LATIN SMALL LETTER DOTLESS I character. 3652 * To obtain correct results for locale insensitive strings, use 3653 * {@code toLowerCase(Locale.ROOT)}. 3654 * 3655 * @return the {@code String}, converted to lowercase. 3656 * @see java.lang.String#toLowerCase(Locale) 3657 */ 3658 public String toLowerCase() { 3659 return toLowerCase(Locale.getDefault()); 3660 } 3661 3662 /** 3663 * Converts all of the characters in this {@code String} to upper 3664 * case using the rules of the given {@code Locale}. Case mapping is based 3665 * on the Unicode Standard version specified by the {@link java.lang.Character Character} 3666 * class. 
Since case mappings are not always 1:1 char mappings, the resulting {@code String} 3667 * and this {@code String} may differ in length. 3668 * <p> 3669 * Examples of locale-sensitive and 1:M case mappings are in the following table: 3670 * <table class="plain"> 3671 * <caption style="display:none">Examples of locale-sensitive and 1:M case mappings. Shows Language code of locale, lower case, upper case, and description.</caption> 3672 * <thead> 3673 * <tr> 3674 * <th scope="col">Language Code of Locale</th> 3675 * <th scope="col">Lower Case</th> 3676 * <th scope="col">Upper Case</th> 3677 * <th scope="col">Description</th> 3678 * </tr> 3679 * </thead> 3680 * <tbody> 3681 * <tr> 3682 * <td>tr (Turkish)</td> 3683 * <th scope="row" style="font-weight:normal; text-align:left">\u0069</th> 3684 * <td>\u0130</td> 3685 * <td>small letter i -> capital letter I with dot above</td> 3686 * </tr> 3687 * <tr> 3688 * <td>tr (Turkish)</td> 3689 * <th scope="row" style="font-weight:normal; text-align:left">\u0131</th> 3690 * <td>\u0049</td> 3691 * <td>small letter dotless i -> capital letter I</td> 3692 * </tr> 3693 * <tr> 3694 * <td>(all)</td> 3695 * <th scope="row" style="font-weight:normal; text-align:left">\u00df</th> 3696 * <td>\u0053 \u0053</td> 3697 * <td>small letter sharp s -> two letters: SS</td> 3698 * </tr> 3699 * <tr> 3700 * <td>(all)</td> 3701 * <th scope="row" style="font-weight:normal; text-align:left">Fahrvergnügen</th> 3702 * <td>FAHRVERGNÜGEN</td> 3703 * <td></td> 3704 * </tr> 3705 * </tbody> 3706 * </table> 3707 * @param locale use the case transformation rules for this locale 3708 * @return the {@code String}, converted to uppercase. 3709 * @see java.lang.String#toUpperCase() 3710 * @see java.lang.String#toLowerCase() 3711 * @see java.lang.String#toLowerCase(Locale) 3712 * @since 1.1 3713 */ 3714 public String toUpperCase(Locale locale) { 3715 return isLatin1() ? 
StringLatin1.toUpperCase(this, value, locale) 3716 : StringUTF16.toUpperCase(this, value, locale); 3717 } 3718 3719 /** 3720 * Converts all of the characters in this {@code String} to upper 3721 * case using the rules of the default locale. This method is equivalent to 3722 * {@code toUpperCase(Locale.getDefault())}. 3723 * 3724 * @apiNote This method is locale sensitive, and may produce unexpected 3725 * results if used for strings that are intended to be interpreted locale 3726 * independently. 3727 * Examples are programming language identifiers, protocol keys, and HTML 3728 * tags. 3729 * For instance, {@code "title".toUpperCase()} in a Turkish locale 3730 * returns {@code "T\u005Cu0130TLE"}, where '\u005Cu0130' is the 3731 * LATIN CAPITAL LETTER I WITH DOT ABOVE character. 3732 * To obtain correct results for locale insensitive strings, use 3733 * {@code toUpperCase(Locale.ROOT)}. 3734 * 3735 * @return the {@code String}, converted to uppercase. 3736 * @see java.lang.String#toUpperCase(Locale) 3737 */ 3738 public String toUpperCase() { 3739 return toUpperCase(Locale.getDefault()); 3740 } 3741 3742 /** 3743 * Returns a string whose value is this string, with all leading 3744 * and trailing space removed, where space is defined 3745 * as any character whose codepoint is less than or equal to 3746 * {@code 'U+0020'} (the space character). 3747 * <p> 3748 * If this {@code String} object represents an empty character 3749 * sequence, or the first and last characters of character sequence 3750 * represented by this {@code String} object both have codes 3751 * that are not space (as defined above), then a 3752 * reference to this {@code String} object is returned. 3753 * <p> 3754 * Otherwise, if all characters in this string are space (as 3755 * defined above), then a {@code String} object representing an 3756 * empty string is returned. 
3757 * <p> 3758 * Otherwise, let <i>k</i> be the index of the first character in the 3759 * string whose code is not a space (as defined above) and let 3760 * <i>m</i> be the index of the last character in the string whose code 3761 * is not a space (as defined above). A {@code String} 3762 * object is returned, representing the substring of this string that 3763 * begins with the character at index <i>k</i> and ends with the 3764 * character at index <i>m</i>-that is, the result of 3765 * {@code this.substring(k, m + 1)}. 3766 * <p> 3767 * This method may be used to trim space (as defined above) from 3768 * the beginning and end of a string. 3769 * 3770 * @return a string whose value is this string, with all leading 3771 * and trailing space removed, or this string if it 3772 * has no leading or trailing space. 3773 */ 3774 public String trim() { 3775 String ret = isLatin1() ? StringLatin1.trim(value) 3776 : StringUTF16.trim(value); 3777 return ret == null ? this : ret; 3778 } 3779 3780 /** 3781 * Returns a string whose value is this string, with all leading 3782 * and trailing {@linkplain Character#isWhitespace(int) white space} 3783 * removed. 3784 * <p> 3785 * If this {@code String} object represents an empty string, 3786 * or if all code points in this string are 3787 * {@linkplain Character#isWhitespace(int) white space}, then an empty string 3788 * is returned. 3789 * <p> 3790 * Otherwise, returns a substring of this string beginning with the first 3791 * code point that is not a {@linkplain Character#isWhitespace(int) white space} 3792 * up to and including the last code point that is not a 3793 * {@linkplain Character#isWhitespace(int) white space}. 3794 * <p> 3795 * This method may be used to strip 3796 * {@linkplain Character#isWhitespace(int) white space} from 3797 * the beginning and end of a string. 
3798 * 3799 * @return a string whose value is this string, with all leading 3800 * and trailing white space removed 3801 * 3802 * @see Character#isWhitespace(int) 3803 * 3804 * @since 11 3805 */ 3806 public String strip() { 3807 String ret = isLatin1() ? StringLatin1.strip(value) 3808 : StringUTF16.strip(value); 3809 return ret == null ? this : ret; 3810 } 3811 3812 /** 3813 * Returns a string whose value is this string, with all leading 3814 * {@linkplain Character#isWhitespace(int) white space} removed. 3815 * <p> 3816 * If this {@code String} object represents an empty string, 3817 * or if all code points in this string are 3818 * {@linkplain Character#isWhitespace(int) white space}, then an empty string 3819 * is returned. 3820 * <p> 3821 * Otherwise, returns a substring of this string beginning with the first 3822 * code point that is not a {@linkplain Character#isWhitespace(int) white space} 3823 * up to and including the last code point of this string. 3824 * <p> 3825 * This method may be used to trim 3826 * {@linkplain Character#isWhitespace(int) white space} from 3827 * the beginning of a string. 3828 * 3829 * @return a string whose value is this string, with all leading white 3830 * space removed 3831 * 3832 * @see Character#isWhitespace(int) 3833 * 3834 * @since 11 3835 */ 3836 public String stripLeading() { 3837 String ret = isLatin1() ? StringLatin1.stripLeading(value) 3838 : StringUTF16.stripLeading(value); 3839 return ret == null ? this : ret; 3840 } 3841 3842 /** 3843 * Returns a string whose value is this string, with all trailing 3844 * {@linkplain Character#isWhitespace(int) white space} removed. 3845 * <p> 3846 * If this {@code String} object represents an empty string, 3847 * or if all characters in this string are 3848 * {@linkplain Character#isWhitespace(int) white space}, then an empty string 3849 * is returned. 
3850 * <p> 3851 * Otherwise, returns a substring of this string beginning with the first 3852 * code point of this string up to and including the last code point 3853 * that is not a {@linkplain Character#isWhitespace(int) white space}. 3854 * <p> 3855 * This method may be used to trim 3856 * {@linkplain Character#isWhitespace(int) white space} from 3857 * the end of a string. 3858 * 3859 * @return a string whose value is this string, with all trailing white 3860 * space removed 3861 * 3862 * @see Character#isWhitespace(int) 3863 * 3864 * @since 11 3865 */ 3866 public String stripTrailing() { 3867 String ret = isLatin1() ? StringLatin1.stripTrailing(value) 3868 : StringUTF16.stripTrailing(value); 3869 return ret == null ? this : ret; 3870 } 3871 3872 /** 3873 * Returns {@code true} if the string is empty or contains only 3874 * {@linkplain Character#isWhitespace(int) white space} codepoints, 3875 * otherwise {@code false}. 3876 * 3877 * @return {@code true} if the string is empty or contains only 3878 * {@linkplain Character#isWhitespace(int) white space} codepoints, 3879 * otherwise {@code false} 3880 * 3881 * @see Character#isWhitespace(int) 3882 * 3883 * @since 11 3884 */ 3885 public boolean isBlank() { 3886 return indexOfNonWhitespace() == length(); 3887 } 3888 3889 /** 3890 * Returns a stream of lines extracted from this string, 3891 * separated by line terminators. 3892 * <p> 3893 * A <i>line terminator</i> is one of the following: 3894 * a line feed character {@code "\n"} (U+000A), 3895 * a carriage return character {@code "\r"} (U+000D), 3896 * or a carriage return followed immediately by a line feed 3897 * {@code "\r\n"} (U+000D U+000A). 3898 * <p> 3899 * A <i>line</i> is either a sequence of zero or more characters 3900 * followed by a line terminator, or it is a sequence of one or 3901 * more characters followed by the end of the string. A 3902 * line does not include the line terminator. 
3903 * <p> 3904 * The stream returned by this method contains the lines from 3905 * this string in the order in which they occur. 3906 * 3907 * @apiNote This definition of <i>line</i> implies that an empty 3908 * string has zero lines and that there is no empty line 3909 * following a line terminator at the end of a string. 3910 * 3911 * @implNote This method provides better performance than 3912 * split("\R") by supplying elements lazily and 3913 * by faster search of new line terminators. 3914 * 3915 * @return the stream of lines extracted from this string 3916 * 3917 * @since 11 3918 */ 3919 public Stream<String> lines() { 3920 return isLatin1() ? StringLatin1.lines(value) : StringUTF16.lines(value); 3921 } 3922 3923 /** 3924 * Adjusts the indentation of each line of this string based on the value of 3925 * {@code n}, and normalizes line termination characters. 3926 * <p> 3927 * This string is conceptually separated into lines using 3928 * {@link String#lines()}. Each line is then adjusted as described below 3929 * and then suffixed with a line feed {@code "\n"} (U+000A). The resulting 3930 * lines are then concatenated and returned. 3931 * <p> 3932 * If {@code n > 0} then {@code n} spaces (U+0020) are inserted at the 3933 * beginning of each line. 3934 * <p> 3935 * If {@code n < 0} then up to {@code n} 3936 * {@linkplain Character#isWhitespace(int) white space characters} are removed 3937 * from the beginning of each line. If a given line does not contain 3938 * sufficient white space then all leading 3939 * {@linkplain Character#isWhitespace(int) white space characters} are removed. 3940 * Each white space character is treated as a single character. In 3941 * particular, the tab character {@code "\t"} (U+0009) is considered a 3942 * single character; it is not expanded. 3943 * <p> 3944 * If {@code n == 0} then the line remains unchanged. However, line 3945 * terminators are still normalized. 
3946 * 3947 * @param n number of leading 3948 * {@linkplain Character#isWhitespace(int) white space characters} 3949 * to add or remove 3950 * 3951 * @return string with indentation adjusted and line endings normalized 3952 * 3953 * @see String#lines() 3954 * @see String#isBlank() 3955 * @see Character#isWhitespace(int) 3956 * 3957 * @since 12 3958 */ 3959 public String indent(int n) { 3960 if (isEmpty()) { 3961 return ""; 3962 } 3963 Stream<String> stream = lines(); 3964 if (n > 0) { 3965 final String spaces = " ".repeat(n); 3966 stream = stream.map(s -> spaces + s); 3967 } else if (n == Integer.MIN_VALUE) { 3968 stream = stream.map(s -> s.stripLeading()); 3969 } else if (n < 0) { 3970 stream = stream.map(s -> s.substring(Math.min(-n, s.indexOfNonWhitespace()))); 3971 } 3972 return stream.collect(Collectors.joining("\n", "", "\n")); 3973 } 3974 3975 private int indexOfNonWhitespace() { 3976 return isLatin1() ? StringLatin1.indexOfNonWhitespace(value) 3977 : StringUTF16.indexOfNonWhitespace(value); 3978 } 3979 3980 private int lastIndexOfNonWhitespace() { 3981 return isLatin1() ? StringLatin1.lastIndexOfNonWhitespace(value) 3982 : StringUTF16.lastIndexOfNonWhitespace(value); 3983 } 3984 3985 /** 3986 * Returns a string whose value is this string, with incidental 3987 * {@linkplain Character#isWhitespace(int) white space} removed from 3988 * the beginning and end of every line. 3989 * <p> 3990 * Incidental {@linkplain Character#isWhitespace(int) white space} 3991 * is often present in a text block to align the content with the opening 3992 * delimiter. For example, in the following code, dots represent incidental 3993 * {@linkplain Character#isWhitespace(int) white space}: 3994 * <blockquote><pre> 3995 * String html = """ 3996 * ..............<html> 3997 * .............. <body> 3998 * .............. <p>Hello, world</p> 3999 * .............. 
</body> 4000 * ..............</html> 4001 * .............."""; 4002 * </pre></blockquote> 4003 * This method treats the incidental 4004 * {@linkplain Character#isWhitespace(int) white space} as indentation to be 4005 * stripped, producing a string that preserves the relative indentation of 4006 * the content. Using | to visualize the start of each line of the string: 4007 * <blockquote><pre> 4008 * |<html> 4009 * | <body> 4010 * | <p>Hello, world</p> 4011 * | </body> 4012 * |</html> 4013 * </pre></blockquote> 4014 * First, the individual lines of this string are extracted. A <i>line</i> 4015 * is a sequence of zero or more characters followed by either a line 4016 * terminator or the end of the string. 4017 * If the string has at least one line terminator, the last line consists 4018 * of the characters between the last terminator and the end of the string. 4019 * Otherwise, if the string has no terminators, the last line is the start 4020 * of the string to the end of the string, in other words, the entire 4021 * string. 4022 * A line does not include the line terminator. 4023 * <p> 4024 * Then, the <i>minimum indentation</i> (min) is determined as follows: 4025 * <ul> 4026 * <li><p>For each non-blank line (as defined by {@link String#isBlank()}), 4027 * the leading {@linkplain Character#isWhitespace(int) white space} 4028 * characters are counted.</p> 4029 * </li> 4030 * <li><p>The leading {@linkplain Character#isWhitespace(int) white space} 4031 * characters on the last line are also counted even if 4032 * {@linkplain String#isBlank() blank}.</p> 4033 * </li> 4034 * </ul> 4035 * <p>The <i>min</i> value is the smallest of these counts. 4036 * <p> 4037 * For each {@linkplain String#isBlank() non-blank} line, <i>min</i> leading 4038 * {@linkplain Character#isWhitespace(int) white space} characters are 4039 * removed, and any trailing {@linkplain Character#isWhitespace(int) white 4040 * space} characters are removed. 
{@linkplain String#isBlank() Blank} lines 4041 * are replaced with the empty string. 4042 * 4043 * <p> 4044 * Finally, the lines are joined into a new string, using the LF character 4045 * {@code "\n"} (U+000A) to separate lines. 4046 * 4047 * @apiNote 4048 * This method's primary purpose is to shift a block of lines as far as 4049 * possible to the left, while preserving relative indentation. Lines 4050 * that were indented the least will thus have no leading 4051 * {@linkplain Character#isWhitespace(int) white space}. 4052 * The result will have the same number of line terminators as this string. 4053 * If this string ends with a line terminator then the result will end 4054 * with a line terminator. 4055 * 4056 * @implSpec 4057 * This method treats all {@linkplain Character#isWhitespace(int) white space} 4058 * characters as having equal width. As long as the indentation on every 4059 * line is consistently composed of the same character sequences, then the 4060 * result will be as described above. 4061 * 4062 * @return string with incidental indentation removed and line 4063 * terminators normalized 4064 * 4065 * @see String#lines() 4066 * @see String#isBlank() 4067 * @see String#indent(int) 4068 * @see Character#isWhitespace(int) 4069 * 4070 * @since 15 4071 * 4072 */ 4073 public String stripIndent() { 4074 int length = length(); 4075 if (length == 0) { 4076 return ""; 4077 } 4078 char lastChar = charAt(length - 1); 4079 boolean optOut = lastChar == '\n' || lastChar == '\r'; 4080 List<String> lines = lines().toList(); 4081 final int outdent = optOut ? 0 : outdent(lines); 4082 return lines.stream() 4083 .map(line -> { 4084 int firstNonWhitespace = line.indexOfNonWhitespace(); 4085 int lastNonWhitespace = line.lastIndexOfNonWhitespace(); 4086 int incidentalWhitespace = Math.min(outdent, firstNonWhitespace); 4087 return firstNonWhitespace > lastNonWhitespace 4088 ? 
"" : line.substring(incidentalWhitespace, lastNonWhitespace); 4089 }) 4090 .collect(Collectors.joining("\n", "", optOut ? "\n" : "")); 4091 } 4092 4093 private static int outdent(List<String> lines) { 4094 // Note: outdent is guaranteed to be zero or positive number. 4095 // If there isn't a non-blank line then the last must be blank 4096 int outdent = Integer.MAX_VALUE; 4097 for (String line : lines) { 4098 int leadingWhitespace = line.indexOfNonWhitespace(); 4099 if (leadingWhitespace != line.length()) { 4100 outdent = Integer.min(outdent, leadingWhitespace); 4101 } 4102 } 4103 String lastLine = lines.get(lines.size() - 1); 4104 if (lastLine.isBlank()) { 4105 outdent = Integer.min(outdent, lastLine.length()); 4106 } 4107 return outdent; 4108 } 4109 4110 /** 4111 * Returns a string whose value is this string, with escape sequences 4112 * translated as if in a string literal. 4113 * <p> 4114 * Escape sequences are translated as follows; 4115 * <table class="striped"> 4116 * <caption style="display:none">Translation</caption> 4117 * <thead> 4118 * <tr> 4119 * <th scope="col">Escape</th> 4120 * <th scope="col">Name</th> 4121 * <th scope="col">Translation</th> 4122 * </tr> 4123 * </thead> 4124 * <tbody> 4125 * <tr> 4126 * <th scope="row">{@code \u005Cb}</th> 4127 * <td>backspace</td> 4128 * <td>{@code U+0008}</td> 4129 * </tr> 4130 * <tr> 4131 * <th scope="row">{@code \u005Ct}</th> 4132 * <td>horizontal tab</td> 4133 * <td>{@code U+0009}</td> 4134 * </tr> 4135 * <tr> 4136 * <th scope="row">{@code \u005Cn}</th> 4137 * <td>line feed</td> 4138 * <td>{@code U+000A}</td> 4139 * </tr> 4140 * <tr> 4141 * <th scope="row">{@code \u005Cf}</th> 4142 * <td>form feed</td> 4143 * <td>{@code U+000C}</td> 4144 * </tr> 4145 * <tr> 4146 * <th scope="row">{@code \u005Cr}</th> 4147 * <td>carriage return</td> 4148 * <td>{@code U+000D}</td> 4149 * </tr> 4150 * <tr> 4151 * <th scope="row">{@code \u005Cs}</th> 4152 * <td>space</td> 4153 * <td>{@code U+0020}</td> 4154 * </tr> 4155 * <tr> 4156 
* <th scope="row">{@code \u005C"}</th> 4157 * <td>double quote</td> 4158 * <td>{@code U+0022}</td> 4159 * </tr> 4160 * <tr> 4161 * <th scope="row">{@code \u005C'}</th> 4162 * <td>single quote</td> 4163 * <td>{@code U+0027}</td> 4164 * </tr> 4165 * <tr> 4166 * <th scope="row">{@code \u005C\u005C}</th> 4167 * <td>backslash</td> 4168 * <td>{@code U+005C}</td> 4169 * </tr> 4170 * <tr> 4171 * <th scope="row">{@code \u005C0 - \u005C377}</th> 4172 * <td>octal escape</td> 4173 * <td>code point equivalents</td> 4174 * </tr> 4175 * <tr> 4176 * <th scope="row">{@code \u005C<line-terminator>}</th> 4177 * <td>continuation</td> 4178 * <td>discard</td> 4179 * </tr> 4180 * </tbody> 4181 * </table> 4182 * 4183 * @implNote 4184 * This method does <em>not</em> translate Unicode escapes such as "{@code \u005cu2022}". 4185 * Unicode escapes are translated by the Java compiler when reading input characters and 4186 * are not part of the string literal specification. 4187 * 4188 * @throws IllegalArgumentException when an escape sequence is malformed. 4189 * 4190 * @return String with escape sequences translated. 4191 * 4192 * @jls 3.10.7 Escape Sequences 4193 * 4194 * @since 15 4195 */ 4196 public String translateEscapes() { 4197 if (isEmpty()) { 4198 return ""; 4199 } 4200 char[] chars = toCharArray(); 4201 int length = chars.length; 4202 int from = 0; 4203 int to = 0; 4204 while (from < length) { 4205 char ch = chars[from++]; 4206 if (ch == '\\') { 4207 ch = from < length ? chars[from++] : '\0'; 4208 switch (ch) { 4209 case 'b': 4210 ch = '\b'; 4211 break; 4212 case 'f': 4213 ch = '\f'; 4214 break; 4215 case 'n': 4216 ch = '\n'; 4217 break; 4218 case 'r': 4219 ch = '\r'; 4220 break; 4221 case 's': 4222 ch = ' '; 4223 break; 4224 case 't': 4225 ch = '\t'; 4226 break; 4227 case '\'': 4228 case '\"': 4229 case '\\': 4230 // as is 4231 break; 4232 case '0': case '1': case '2': case '3': 4233 case '4': case '5': case '6': case '7': 4234 int limit = Integer.min(from + (ch <= '3' ? 
2 : 1), length); 4235 int code = ch - '0'; 4236 while (from < limit) { 4237 ch = chars[from]; 4238 if (ch < '0' || '7' < ch) { 4239 break; 4240 } 4241 from++; 4242 code = (code << 3) | (ch - '0'); 4243 } 4244 ch = (char)code; 4245 break; 4246 case '\n': 4247 continue; 4248 case '\r': 4249 if (from < length && chars[from] == '\n') { 4250 from++; 4251 } 4252 continue; 4253 default: { 4254 String msg = String.format( 4255 "Invalid escape sequence: \\%c \\\\u%04X", 4256 ch, (int)ch); 4257 throw new IllegalArgumentException(msg); 4258 } 4259 } 4260 } 4261 4262 chars[to++] = ch; 4263 } 4264 4265 return new String(chars, 0, to); 4266 } 4267 4268 /** 4269 * This method allows the application of a function to {@code this} 4270 * string. The function should expect a single String argument 4271 * and produce an {@code R} result. 4272 * <p> 4273 * Any exception thrown by {@code f.apply()} will be propagated to the 4274 * caller. 4275 * 4276 * @param f a function to apply 4277 * 4278 * @param <R> the type of the result 4279 * 4280 * @return the result of applying the function to this string 4281 * 4282 * @see java.util.function.Function 4283 * 4284 * @since 12 4285 */ 4286 public <R> R transform(Function<? super String, ? extends R> f) { 4287 return f.apply(this); 4288 } 4289 4290 /** 4291 * This object (which is already a string!) is itself returned. 4292 * 4293 * @return the string itself. 4294 */ 4295 public String toString() { 4296 return this; 4297 } 4298 4299 /** 4300 * Returns a stream of {@code int} zero-extending the {@code char} values 4301 * from this sequence. Any char which maps to a {@linkplain 4302 * Character##unicode surrogate code point} is passed through 4303 * uninterpreted. 4304 * 4305 * @return an IntStream of char values from this sequence 4306 * @since 9 4307 */ 4308 @Override 4309 public IntStream chars() { 4310 return StreamSupport.intStream( 4311 isLatin1() ? 
new StringLatin1.CharsSpliterator(value, Spliterator.IMMUTABLE) 4312 : new StringUTF16.CharsSpliterator(value, Spliterator.IMMUTABLE), 4313 false); 4314 } 4315 4316 4317 /** 4318 * Returns a stream of code point values from this sequence. Any surrogate 4319 * pairs encountered in the sequence are combined as if by {@linkplain 4320 * Character#toCodePoint Character.toCodePoint} and the result is passed 4321 * to the stream. Any other code units, including ordinary BMP characters, 4322 * unpaired surrogates, and undefined code units, are zero-extended to 4323 * {@code int} values which are then passed to the stream. 4324 * 4325 * @return an IntStream of Unicode code points from this sequence 4326 * @since 9 4327 */ 4328 @Override 4329 public IntStream codePoints() { 4330 return StreamSupport.intStream( 4331 isLatin1() ? new StringLatin1.CharsSpliterator(value, Spliterator.IMMUTABLE) 4332 : new StringUTF16.CodePointsSpliterator(value, Spliterator.IMMUTABLE), 4333 false); 4334 } 4335 4336 /** 4337 * Converts this string to a new character array. 4338 * 4339 * @return a newly allocated character array whose length is the length 4340 * of this string and whose contents are initialized to contain 4341 * the character sequence represented by this string. 4342 */ 4343 public char[] toCharArray() { 4344 return isLatin1() ? StringLatin1.toChars(value) 4345 : StringUTF16.toChars(value); 4346 } 4347 4348 /** 4349 * Returns a formatted string using the specified format string and 4350 * arguments. 4351 * 4352 * <p> The locale always used is the one returned by {@link 4353 * java.util.Locale#getDefault(java.util.Locale.Category) 4354 * Locale.getDefault(Locale.Category)} with 4355 * {@link java.util.Locale.Category#FORMAT FORMAT} category specified. 4356 * 4357 * @param format 4358 * A <a href="../util/Formatter.html#syntax">format string</a> 4359 * 4360 * @param args 4361 * Arguments referenced by the format specifiers in the format 4362 * string. 
If there are more arguments than format specifiers, the 4363 * extra arguments are ignored. The number of arguments is 4364 * variable and may be zero. The maximum number of arguments is 4365 * limited by the maximum dimension of a Java array as defined by 4366 * <cite>The Java Virtual Machine Specification</cite>. 4367 * The behaviour on a 4368 * {@code null} argument depends on the <a 4369 * href="../util/Formatter.html#syntax">conversion</a>. 4370 * 4371 * @throws java.util.IllegalFormatException 4372 * If a format string contains an illegal syntax, a format 4373 * specifier that is incompatible with the given arguments, 4374 * insufficient arguments given the format string, or other 4375 * illegal conditions. For specification of all possible 4376 * formatting errors, see the <a 4377 * href="../util/Formatter.html#detail">Details</a> section of the 4378 * formatter class specification. 4379 * 4380 * @return A formatted string 4381 * 4382 * @see java.util.Formatter 4383 * @since 1.5 4384 */ 4385 public static String format(String format, Object... args) { 4386 return new Formatter().format(format, args).toString(); 4387 } 4388 4389 /** 4390 * Returns a formatted string using the specified locale, format string, 4391 * and arguments. 4392 * 4393 * @param l 4394 * The {@linkplain java.util.Locale locale} to apply during 4395 * formatting. If {@code l} is {@code null} then no localization 4396 * is applied. 4397 * 4398 * @param format 4399 * A <a href="../util/Formatter.html#syntax">format string</a> 4400 * 4401 * @param args 4402 * Arguments referenced by the format specifiers in the format 4403 * string. If there are more arguments than format specifiers, the 4404 * extra arguments are ignored. The number of arguments is 4405 * variable and may be zero. The maximum number of arguments is 4406 * limited by the maximum dimension of a Java array as defined by 4407 * <cite>The Java Virtual Machine Specification</cite>. 
4408 * The behaviour on a 4409 * {@code null} argument depends on the 4410 * <a href="../util/Formatter.html#syntax">conversion</a>. 4411 * 4412 * @throws java.util.IllegalFormatException 4413 * If a format string contains an illegal syntax, a format 4414 * specifier that is incompatible with the given arguments, 4415 * insufficient arguments given the format string, or other 4416 * illegal conditions. For specification of all possible 4417 * formatting errors, see the <a 4418 * href="../util/Formatter.html#detail">Details</a> section of the 4419 * formatter class specification 4420 * 4421 * @return A formatted string 4422 * 4423 * @see java.util.Formatter 4424 * @since 1.5 4425 */ 4426 public static String format(Locale l, String format, Object... args) { 4427 return new Formatter(l).format(format, args).toString(); 4428 } 4429 4430 /** 4431 * Formats using this string as the format string, and the supplied 4432 * arguments. 4433 * 4434 * @implSpec This method is equivalent to {@code String.format(this, args)}. 4435 * 4436 * @param args 4437 * Arguments referenced by the format specifiers in this string. 4438 * 4439 * @return A formatted string 4440 * 4441 * @see java.lang.String#format(String,Object...) 4442 * @see java.util.Formatter 4443 * 4444 * @since 15 4445 * 4446 */ 4447 public String formatted(Object... args) { 4448 return new Formatter().format(this, args).toString(); 4449 } 4450 4451 /** 4452 * Returns the string representation of the {@code Object} argument. 4453 * 4454 * @param obj an {@code Object}. 4455 * @return if the argument is {@code null}, then a string equal to 4456 * {@code "null"}; otherwise, the value of 4457 * {@code obj.toString()} is returned. 4458 * @see java.lang.Object#toString() 4459 */ 4460 public static String valueOf(Object obj) { 4461 return (obj == null) ? "null" : obj.toString(); 4462 } 4463 4464 /** 4465 * Returns the string representation of the {@code char} array 4466 * argument. 
The contents of the character array are copied; subsequent 4467 * modification of the character array does not affect the returned 4468 * string. 4469 * 4470 * @param data the character array. 4471 * @return a {@code String} that contains the characters of the 4472 * character array. 4473 */ 4474 public static String valueOf(char[] data) { 4475 return new String(data); 4476 } 4477 4478 /** 4479 * Returns the string representation of a specific subarray of the 4480 * {@code char} array argument. 4481 * <p> 4482 * The {@code offset} argument is the index of the first 4483 * character of the subarray. The {@code count} argument 4484 * specifies the length of the subarray. The contents of the subarray 4485 * are copied; subsequent modification of the character array does not 4486 * affect the returned string. 4487 * 4488 * @param data the character array. 4489 * @param offset initial offset of the subarray. 4490 * @param count length of the subarray. 4491 * @return a {@code String} that contains the characters of the 4492 * specified subarray of the character array. 4493 * @throws IndexOutOfBoundsException if {@code offset} is 4494 * negative, or {@code count} is negative, or 4495 * {@code offset+count} is larger than 4496 * {@code data.length}. 4497 */ 4498 public static String valueOf(char[] data, int offset, int count) { 4499 return new String(data, offset, count); 4500 } 4501 4502 /** 4503 * Equivalent to {@link #valueOf(char[], int, int)}. 4504 * 4505 * @param data the character array. 4506 * @param offset initial offset of the subarray. 4507 * @param count length of the subarray. 4508 * @return a {@code String} that contains the characters of the 4509 * specified subarray of the character array. 4510 * @throws IndexOutOfBoundsException if {@code offset} is 4511 * negative, or {@code count} is negative, or 4512 * {@code offset+count} is larger than 4513 * {@code data.length}. 
4514 */ 4515 public static String copyValueOf(char[] data, int offset, int count) { 4516 return new String(data, offset, count); 4517 } 4518 4519 /** 4520 * Equivalent to {@link #valueOf(char[])}. 4521 * 4522 * @param data the character array. 4523 * @return a {@code String} that contains the characters of the 4524 * character array. 4525 */ 4526 public static String copyValueOf(char[] data) { 4527 return new String(data); 4528 } 4529 4530 /** 4531 * Returns the string representation of the {@code boolean} argument. 4532 * 4533 * @param b a {@code boolean}. 4534 * @return if the argument is {@code true}, a string equal to 4535 * {@code "true"} is returned; otherwise, a string equal to 4536 * {@code "false"} is returned. 4537 */ 4538 public static String valueOf(boolean b) { 4539 return b ? "true" : "false"; 4540 } 4541 4542 /** 4543 * Returns the string representation of the {@code char} 4544 * argument. 4545 * 4546 * @param c a {@code char}. 4547 * @return a string of length {@code 1} containing 4548 * as its single character the argument {@code c}. 4549 */ 4550 public static String valueOf(char c) { 4551 if (COMPACT_STRINGS && StringLatin1.canEncode(c)) { 4552 return new String(StringLatin1.toBytes(c), LATIN1); 4553 } 4554 return new String(StringUTF16.toBytes(c), UTF16); 4555 } 4556 4557 /** 4558 * Returns the string representation of the {@code int} argument. 4559 * <p> 4560 * The representation is exactly the one returned by the 4561 * {@code Integer.toString} method of one argument. 4562 * 4563 * @param i an {@code int}. 4564 * @return a string representation of the {@code int} argument. 4565 * @see java.lang.Integer#toString(int, int) 4566 */ 4567 public static String valueOf(int i) { 4568 return Integer.toString(i); 4569 } 4570 4571 /** 4572 * Returns the string representation of the {@code long} argument. 4573 * <p> 4574 * The representation is exactly the one returned by the 4575 * {@code Long.toString} method of one argument. 
4576 * 4577 * @param l a {@code long}. 4578 * @return a string representation of the {@code long} argument. 4579 * @see java.lang.Long#toString(long) 4580 */ 4581 public static String valueOf(long l) { 4582 return Long.toString(l); 4583 } 4584 4585 /** 4586 * Returns the string representation of the {@code float} argument. 4587 * <p> 4588 * The representation is exactly the one returned by the 4589 * {@code Float.toString} method of one argument. 4590 * 4591 * @param f a {@code float}. 4592 * @return a string representation of the {@code float} argument. 4593 * @see java.lang.Float#toString(float) 4594 */ 4595 public static String valueOf(float f) { 4596 return Float.toString(f); 4597 } 4598 4599 /** 4600 * Returns the string representation of the {@code double} argument. 4601 * <p> 4602 * The representation is exactly the one returned by the 4603 * {@code Double.toString} method of one argument. 4604 * 4605 * @param d a {@code double}. 4606 * @return a string representation of the {@code double} argument. 4607 * @see java.lang.Double#toString(double) 4608 */ 4609 public static String valueOf(double d) { 4610 return Double.toString(d); 4611 } 4612 4613 /** 4614 * Returns a canonical representation for the string object. 4615 * <p> 4616 * A pool of strings, initially empty, is maintained privately by the 4617 * class {@code String}. 4618 * <p> 4619 * When the intern method is invoked, if the pool already contains a 4620 * string equal to this {@code String} object as determined by 4621 * the {@link #equals(Object)} method, then the string from the pool is 4622 * returned. Otherwise, this {@code String} object is added to the 4623 * pool and a reference to this {@code String} object is returned. 4624 * <p> 4625 * It follows that for any two strings {@code s} and {@code t}, 4626 * {@code s.intern() == t.intern()} is {@code true} 4627 * if and only if {@code s.equals(t)} is {@code true}. 
4628 * <p> 4629 * All literal strings and string-valued constant expressions are 4630 * interned. String literals are defined in section {@jls 3.10.5} of the 4631 * <cite>The Java Language Specification</cite>. 4632 * 4633 * @return a string that has the same contents as this string, but is 4634 * guaranteed to be from a pool of unique strings. 4635 */ 4636 public native String intern(); 4637 4638 /** 4639 * Returns a string whose value is the concatenation of this 4640 * string repeated {@code count} times. 4641 * <p> 4642 * If this string is empty or count is zero then the empty 4643 * string is returned. 4644 * 4645 * @param count number of times to repeat 4646 * 4647 * @return A string composed of this string repeated 4648 * {@code count} times or the empty string if this 4649 * string is empty or count is zero 4650 * 4651 * @throws IllegalArgumentException if the {@code count} is 4652 * negative. 4653 * 4654 * @since 11 4655 */ 4656 public String repeat(int count) { 4657 if (count < 0) { 4658 throw new IllegalArgumentException("count is negative: " + count); 4659 } 4660 if (count == 1) { 4661 return this; 4662 } 4663 final int len = value.length; 4664 if (len == 0 || count == 0) { 4665 return ""; 4666 } 4667 if (Integer.MAX_VALUE / count < len) { 4668 throw new OutOfMemoryError("Required length exceeds implementation limit"); 4669 } 4670 if (len == 1) { 4671 final byte[] single = new byte[count]; 4672 Arrays.fill(single, value[0]); 4673 return new String(single, coder); 4674 } 4675 final int limit = len * count; 4676 final byte[] multiple = new byte[limit]; 4677 System.arraycopy(value, 0, multiple, 0, len); 4678 repeatCopyRest(multiple, 0, limit, len); 4679 return new String(multiple, coder); 4680 } 4681 4682 /** 4683 * Used to perform copying after the initial insertion. Copying is optimized 4684 * by using power of two duplication. 
First pass duplicates original copy, 4685 * second pass then duplicates the original and the copy yielding four copies, 4686 * third pass duplicates four copies yielding eight copies, and so on. 4687 * Finally, the remainder is filled in with prior copies. 4688 * 4689 * @implNote The technique used here is significantly faster than hand-rolled 4690 * loops or special casing small numbers due to the intensive optimization 4691 * done by intrinsic {@code System.arraycopy}. 4692 * 4693 * @param buffer destination buffer 4694 * @param offset offset in the destination buffer 4695 * @param limit total replicated including what is already in the buffer 4696 * @param copied number of bytes that have already in the buffer 4697 */ 4698 static void repeatCopyRest(byte[] buffer, int offset, int limit, int copied) { 4699 // Initial copy is in the buffer. 4700 for (; copied < limit - copied; copied <<= 1) { 4701 // Power of two duplicate. 4702 System.arraycopy(buffer, offset, buffer, offset + copied, copied); 4703 } 4704 // Duplicate remainder. 4705 System.arraycopy(buffer, offset, buffer, offset + copied, limit - copied); 4706 } 4707 4708 //////////////////////////////////////////////////////////////// 4709 4710 /** 4711 * Copy character bytes from this string into dst starting at dstBegin. 4712 * This method doesn't perform any range checking. 4713 * 4714 * Invoker guarantees: dst is in UTF16 (inflate itself for asb), if two 4715 * coders are different, and dst is big enough (range check) 4716 * 4717 * @param dstBegin the char index, not offset of byte[] 4718 * @param coder the coder of dst[] 4719 */ 4720 void getBytes(byte[] dst, int dstBegin, byte coder) { 4721 if (coder() == coder) { 4722 System.arraycopy(value, 0, dst, dstBegin << coder, value.length); 4723 } else { // this.coder == LATIN && coder == UTF16 4724 StringLatin1.inflate(value, 0, dst, dstBegin, value.length); 4725 } 4726 } 4727 4728 /** 4729 * Copy character bytes from this string into dst starting at dstBegin. 
4730 * This method doesn't perform any range checking. 4731 * 4732 * Invoker guarantees: dst is in UTF16 (inflate itself for asb), if two 4733 * coders are different, and dst is big enough (range check) 4734 * 4735 * @param srcPos the char index, not offset of byte[] 4736 * @param dstBegin the char index to start from 4737 * @param coder the coder of dst[] 4738 * @param length the amount of copied chars 4739 */ 4740 void getBytes(byte[] dst, int srcPos, int dstBegin, byte coder, int length) { 4741 if (coder() == coder) { 4742 System.arraycopy(value, srcPos << coder, dst, dstBegin << coder, length << coder); 4743 } else { // this.coder == LATIN && coder == UTF16 4744 StringLatin1.inflate(value, srcPos, dst, dstBegin, length); 4745 } 4746 } 4747 4748 /* 4749 * Package private constructor. Trailing Void argument is there for 4750 * disambiguating it against other (public) constructors. 4751 * 4752 * Stores the char[] value into a byte[] that each byte represents 4753 * the8 low-order bits of the corresponding character, if the char[] 4754 * contains only latin1 character. Or a byte[] that stores all 4755 * characters in their byte sequences defined by the {@code StringUTF16}. 4756 */ 4757 String(char[] value, int off, int len, Void sig) { 4758 if (len == 0) { 4759 this.value = "".value; 4760 this.coder = "".coder; 4761 return; 4762 } 4763 if (COMPACT_STRINGS) { 4764 byte[] val = StringUTF16.compress(value, off, len); 4765 if (val != null) { 4766 this.value = val; 4767 this.coder = LATIN1; 4768 return; 4769 } 4770 } 4771 this.coder = UTF16; 4772 this.value = StringUTF16.toBytes(value, off, len); 4773 } 4774 4775 /* 4776 * Package private constructor. Trailing Void argument is there for 4777 * disambiguating it against other (public) constructors. 
4778 */ 4779 String(AbstractStringBuilder asb, Void sig) { 4780 byte[] val = asb.getValue(); 4781 int length = asb.length(); 4782 if (asb.isLatin1()) { 4783 this.coder = LATIN1; 4784 this.value = Arrays.copyOfRange(val, 0, length); 4785 } else { 4786 // only try to compress val if some characters were deleted. 4787 if (COMPACT_STRINGS && asb.maybeLatin1) { 4788 byte[] buf = StringUTF16.compress(val, 0, length); 4789 if (buf != null) { 4790 this.coder = LATIN1; 4791 this.value = buf; 4792 return; 4793 } 4794 } 4795 this.coder = UTF16; 4796 this.value = Arrays.copyOfRange(val, 0, length << 1); 4797 } 4798 } 4799 4800 /* 4801 * Package private constructor which shares value array for speed. 4802 */ 4803 String(byte[] value, byte coder) { 4804 this.value = value; 4805 this.coder = coder; 4806 } 4807 4808 byte coder() { 4809 return COMPACT_STRINGS ? coder : UTF16; 4810 } 4811 4812 byte[] value() { 4813 return value; 4814 } 4815 4816 boolean isLatin1() { 4817 return COMPACT_STRINGS && coder == LATIN1; 4818 } 4819 4820 @Native static final byte LATIN1 = 0; 4821 @Native static final byte UTF16 = 1; 4822 4823 /* 4824 * StringIndexOutOfBoundsException if {@code index} is 4825 * negative or greater than or equal to {@code length}. 4826 */ 4827 static void checkIndex(int index, int length) { 4828 Preconditions.checkIndex(index, length, Preconditions.SIOOBE_FORMATTER); 4829 } 4830 4831 /* 4832 * StringIndexOutOfBoundsException if {@code offset} 4833 * is negative or greater than {@code length}. 4834 */ 4835 static void checkOffset(int offset, int length) { 4836 Preconditions.checkFromToIndex(offset, length, length, Preconditions.SIOOBE_FORMATTER); 4837 } 4838 4839 /* 4840 * Check {@code offset}, {@code count} against {@code 0} and {@code length} 4841 * bounds. 
4842 * 4843 * @return {@code offset} if the sub-range within bounds of the range 4844 * @throws StringIndexOutOfBoundsException 4845 * If {@code offset} is negative, {@code count} is negative, 4846 * or {@code offset} is greater than {@code length - count} 4847 */ 4848 static int checkBoundsOffCount(int offset, int count, int length) { 4849 return Preconditions.checkFromIndexSize(offset, count, length, Preconditions.SIOOBE_FORMATTER); 4850 } 4851 4852 /* 4853 * Check {@code begin}, {@code end} against {@code 0} and {@code length} 4854 * bounds. 4855 * 4856 * @throws StringIndexOutOfBoundsException 4857 * If {@code begin} is negative, {@code begin} is greater than 4858 * {@code end}, or {@code end} is greater than {@code length}. 4859 */ 4860 static void checkBoundsBeginEnd(int begin, int end, int length) { 4861 Preconditions.checkFromToIndex(begin, end, length, Preconditions.SIOOBE_FORMATTER); 4862 } 4863 4864 /** 4865 * Returns the string representation of the {@code codePoint} 4866 * argument. 4867 * 4868 * @param codePoint a {@code codePoint}. 4869 * @return a string of length {@code 1} or {@code 2} containing 4870 * as its single character the argument {@code codePoint}. 4871 * @throws IllegalArgumentException if the specified 4872 * {@code codePoint} is not a {@linkplain Character#isValidCodePoint 4873 * valid Unicode code point}. 
4874 */ 4875 static String valueOfCodePoint(int codePoint) { 4876 if (COMPACT_STRINGS && StringLatin1.canEncode(codePoint)) { 4877 return new String(StringLatin1.toBytes((char)codePoint), LATIN1); 4878 } else if (Character.isBmpCodePoint(codePoint)) { 4879 return new String(StringUTF16.toBytes((char)codePoint), UTF16); 4880 } else if (Character.isSupplementaryCodePoint(codePoint)) { 4881 return new String(StringUTF16.toBytesSupplementary(codePoint), UTF16); 4882 } 4883 4884 throw new IllegalArgumentException( 4885 format("Not a valid Unicode code point: 0x%X", codePoint)); 4886 } 4887 4888 /** 4889 * Returns an {@link Optional} containing the nominal descriptor for this 4890 * instance, which is the instance itself. 4891 * 4892 * @return an {@link Optional} describing the {@linkplain String} instance 4893 * @since 12 4894 */ 4895 @Override 4896 public Optional<String> describeConstable() { 4897 return Optional.of(this); 4898 } 4899 4900 /** 4901 * Resolves this instance as a {@link ConstantDesc}, the result of which is 4902 * the instance itself. 4903 * 4904 * @param lookup ignored 4905 * @return the {@linkplain String} instance 4906 * @since 12 4907 */ 4908 @Override 4909 public String resolveConstantDesc(MethodHandles.Lookup lookup) { 4910 return this; 4911 } 4912 4913 }