1 /* 2 * Copyright (c) 1994, 2023, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 
24 */ 25 26 package java.lang; 27 28 import java.io.ObjectStreamField; 29 import java.io.UnsupportedEncodingException; 30 import java.lang.annotation.Native; 31 import java.lang.foreign.MemorySegment; 32 import java.lang.foreign.ValueLayout; 33 import java.lang.invoke.MethodHandles; 34 import java.lang.constant.Constable; 35 import java.lang.constant.ConstantDesc; 36 import java.nio.ByteBuffer; 37 import java.nio.CharBuffer; 38 import java.nio.charset.*; 39 import java.util.ArrayList; 40 import java.util.Arrays; 41 import java.util.Comparator; 42 import java.util.Formatter; 43 import java.util.List; 44 import java.util.Locale; 45 import java.util.Objects; 46 import java.util.Optional; 47 import java.util.Spliterator; 48 import java.util.function.Function; 49 import java.util.regex.Pattern; 50 import java.util.regex.PatternSyntaxException; 51 import java.util.stream.Collectors; 52 import java.util.stream.IntStream; 53 import java.util.stream.Stream; 54 import java.util.stream.StreamSupport; 55 56 import jdk.internal.util.ArraysSupport; 57 import jdk.internal.util.Preconditions; 58 import jdk.internal.vm.annotation.ForceInline; 59 import jdk.internal.vm.annotation.IntrinsicCandidate; 60 import jdk.internal.vm.annotation.Stable; 61 import sun.nio.cs.ArrayDecoder; 62 import sun.nio.cs.ArrayEncoder; 63 64 import sun.nio.cs.ISO_8859_1; 65 import sun.nio.cs.US_ASCII; 66 import sun.nio.cs.UTF_8; 67 68 /** 69 * The {@code String} class represents character strings. All 70 * string literals in Java programs, such as {@code "abc"}, are 71 * implemented as instances of this class. 72 * <p> 73 * Strings are constant; their values cannot be changed after they 74 * are created. String buffers support mutable strings. 75 * Because String objects are immutable they can be shared. 
For example: 76 * <blockquote><pre> 77 * String str = "abc"; 78 * </pre></blockquote><p> 79 * is equivalent to: 80 * <blockquote><pre> 81 * char data[] = {'a', 'b', 'c'}; 82 * String str = new String(data); 83 * </pre></blockquote><p> 84 * Here are some more examples of how strings can be used: 85 * <blockquote><pre> 86 * System.out.println("abc"); 87 * String cde = "cde"; 88 * System.out.println("abc" + cde); 89 * String c = "abc".substring(2, 3); 90 * String d = cde.substring(1, 2); 91 * </pre></blockquote> 92 * <p> 93 * The class {@code String} includes methods for examining 94 * individual characters of the sequence, for comparing strings, for 95 * searching strings, for extracting substrings, and for creating a 96 * copy of a string with all characters translated to uppercase or to 97 * lowercase. Case mapping is based on the Unicode Standard version 98 * specified by the {@link java.lang.Character Character} class. 99 * <p> 100 * The Java language provides special support for the string 101 * concatenation operator ( + ), and for conversion of 102 * other objects to strings. For additional information on string 103 * concatenation and conversion, see <i>The Java Language Specification</i>. 104 * 105 * <p> Unless otherwise noted, passing a {@code null} argument to a constructor 106 * or method in this class will cause a {@link NullPointerException} to be 107 * thrown. 108 * 109 * <p>A {@code String} represents a string in the UTF-16 format 110 * in which <em>supplementary characters</em> are represented by <em>surrogate 111 * pairs</em> (see the section <a href="Character.html#unicode">Unicode 112 * Character Representations</a> in the {@code Character} class for 113 * more information). 114 * Index values refer to {@code char} code units, so a supplementary 115 * character uses two positions in a {@code String}. 
116 * <p>The {@code String} class provides methods for dealing with 117 * Unicode code points (i.e., characters), in addition to those for 118 * dealing with Unicode code units (i.e., {@code char} values). 119 * 120 * <p>Unless otherwise noted, methods for comparing Strings do not take locale 121 * into account. The {@link java.text.Collator} class provides methods for 122 * finer-grain, locale-sensitive String comparison. 123 * 124 * @implNote The implementation of the string concatenation operator is left to 125 * the discretion of a Java compiler, as long as the compiler ultimately conforms 126 * to <i>The Java Language Specification</i>. For example, the {@code javac} compiler 127 * may implement the operator with {@code StringBuffer}, {@code StringBuilder}, 128 * or {@code java.lang.invoke.StringConcatFactory} depending on the JDK version. The 129 * implementation of string conversion is typically through the method {@code toString}, 130 * defined by {@code Object} and inherited by all classes in Java. 131 * 132 * @author Lee Boynton 133 * @author Arthur van Hoff 134 * @author Martin Buchholz 135 * @author Ulf Zibis 136 * @see java.lang.Object#toString() 137 * @see java.lang.StringBuffer 138 * @see java.lang.StringBuilder 139 * @see java.nio.charset.Charset 140 * @since 1.0 141 * @jls 15.18.1 String Concatenation Operator + 142 */ 143 144 public final class String 145 implements java.io.Serializable, Comparable<String>, CharSequence, 146 Constable, ConstantDesc { 147 148 /** 149 * The value is used for character storage. 150 * 151 * @implNote This field is trusted by the VM, and is a subject to 152 * constant folding if String instance is constant. Overwriting this 153 * field after construction will cause problems. 154 * 155 * Additionally, it is marked with {@link Stable} to trust the contents 156 * of the array. No other facility in JDK provides this functionality (yet). 157 * {@link Stable} is safe here, because value is never null. 
*/
    @Stable
    private final byte[] value;

    /**
     * The identifier of the encoding used to encode the bytes in
     * {@code value}. The supported values in this implementation are
     *
     * LATIN1
     * UTF16
     *
     * @implNote This field is trusted by the VM, and is subject to
     * constant folding if the String instance is constant. Overwriting this
     * field after construction will cause problems.
     */
    private final byte coder;

    /** Cache the hash code for the string */
    private int hash; // Default to 0

    /**
     * Cache if the hash has been calculated as actually being zero, enabling
     * us to avoid recalculating this.
     */
    private boolean hashIsZero; // Default to false;

    /** use serialVersionUID from JDK 1.0.2 for interoperability */
    @java.io.Serial
    private static final long serialVersionUID = -6849794470754667710L;

    /**
     * If String compaction is disabled, the bytes in {@code value} are
     * always encoded in UTF16.
     *
     * For methods with several possible implementation paths, when String
     * compaction is disabled, only one code path is taken.
     *
     * The instance field value is generally opaque to optimizing JIT
     * compilers. Therefore, in performance-sensitive places, an explicit
     * check of the static boolean {@code COMPACT_STRINGS} is done first
     * before checking the {@code coder} field since the static boolean
     * {@code COMPACT_STRINGS} would be constant folded away by an
     * optimizing JIT compiler. The idioms for these cases are as follows.
     *
     * Code such as:
     *
     *    if (coder == LATIN1) { ... }
     *
     * can be written more optimally as
     *
     *    if (coder() == LATIN1) { ... }
     *
     * or:
     *
     *    if (COMPACT_STRINGS &amp;&amp; coder == LATIN1) { ... }
     *
     * An optimizing JIT compiler can fold the above conditional as:
     *
     *    COMPACT_STRINGS == true  =&gt; if (coder == LATIN1) { ...
}
     *    COMPACT_STRINGS == false =&gt; if (false)           { ... }
     *
     * @implNote
     * The actual value for this field is injected by the JVM. The static
     * initialization block is used to set the value here to communicate
     * that this static final field is not statically foldable, and to
     * avoid any possible circular dependency during VM initialization.
     */
    static final boolean COMPACT_STRINGS;

    static {
        // Assigned in an initializer block rather than inline; see the
        // @implNote on the field for why.
        COMPACT_STRINGS = true;
    }

    /**
     * Class String is special cased within the Serialization Stream Protocol.
     *
     * A String instance is written into an ObjectOutputStream according to
     * <a href="{@docRoot}/../specs/serialization/protocol.html#stream-elements">
     * <cite>Java Object Serialization Specification</cite>, Section 6.2, "Stream Elements"</a>
     */
    @java.io.Serial
    private static final ObjectStreamField[] serialPersistentFields =
        new ObjectStreamField[0];

    /**
     * Initializes a newly created {@code String} object so that it represents
     * an empty character sequence.  Note that use of this constructor is
     * unnecessary since Strings are immutable.
     */
    public String() {
        // Reuse the storage and coder of the empty literal instead of
        // allocating a new backing array.
        this.value = "".value;
        this.coder = "".coder;
    }

    /**
     * Initializes a newly created {@code String} object so that it represents
     * the same sequence of characters as the argument; in other words, the
     * newly created string is a copy of the argument string. Unless an
     * explicit copy of {@code original} is needed, use of this constructor is
     * unnecessary since Strings are immutable.
*
     * @param  original
     *         A {@code String}
     */
    @IntrinsicCandidate
    public String(String original) {
        // The backing array reference is shared with the original, not cloned.
        this.value = original.value;
        this.coder = original.coder;
        // Carry the cached hash state over so it need not be recomputed.
        this.hash = original.hash;
        this.hashIsZero = original.hashIsZero;
    }

    /**
     * Allocates a new {@code String} so that it represents the sequence of
     * characters currently contained in the character array argument. The
     * contents of the character array are copied; subsequent modification of
     * the character array does not affect the newly created string.
     *
     * @param  value
     *         The initial value of the string
     */
    public String(char[] value) {
        // Whole-array case: no range check is needed, so null is passed for
        // the trailing marker argument.
        this(value, 0, value.length, null);
    }

    /**
     * Allocates a new {@code String} that contains characters from a subarray
     * of the character array argument. The {@code offset} argument is the
     * index of the first character of the subarray and the {@code count}
     * argument specifies the length of the subarray. The contents of the
     * subarray are copied; subsequent modification of the character array does
     * not affect the newly created string.
     *
     * @param  value
     *         Array that is the source of characters
     *
     * @param  offset
     *         The initial offset
     *
     * @param  count
     *         The length
     *
     * @throws  IndexOutOfBoundsException
     *          If {@code offset} is negative, {@code count} is negative, or
     *          {@code offset} is greater than {@code value.length - count}
     */
    public String(char[] value, int offset, int count) {
        // rangeCheck runs as an argument expression, i.e. before the
        // delegated-to constructor body executes.
        this(value, offset, count, rangeCheck(value, offset, count));
    }

    /**
     * Validates {@code offset}/{@code count} against {@code value.length} via
     * {@code checkBoundsOffCount}. Always returns {@code null}; the {@code Void}
     * return type only exists so the call can be used as an argument
     * expression inside a {@code this(...)} invocation.
     */
    private static Void rangeCheck(char[] value, int offset, int count) {
        checkBoundsOffCount(offset, count, value.length);
        return null;
    }

    /**
     * Allocates a new {@code String} that contains characters from a subarray
     * of the <a href="Character.html#unicode">Unicode code point</a> array
     * argument.
The {@code offset} argument is the index of the first code 317 * point of the subarray and the {@code count} argument specifies the 318 * length of the subarray. The contents of the subarray are converted to 319 * {@code char}s; subsequent modification of the {@code int} array does not 320 * affect the newly created string. 321 * 322 * @param codePoints 323 * Array that is the source of Unicode code points 324 * 325 * @param offset 326 * The initial offset 327 * 328 * @param count 329 * The length 330 * 331 * @throws IllegalArgumentException 332 * If any invalid Unicode code point is found in {@code 333 * codePoints} 334 * 335 * @throws IndexOutOfBoundsException 336 * If {@code offset} is negative, {@code count} is negative, or 337 * {@code offset} is greater than {@code codePoints.length - count} 338 * 339 * @since 1.5 340 */ 341 public String(int[] codePoints, int offset, int count) { 342 checkBoundsOffCount(offset, count, codePoints.length); 343 if (count == 0) { 344 this.value = "".value; 345 this.coder = "".coder; 346 return; 347 } 348 if (COMPACT_STRINGS) { 349 byte[] val = StringLatin1.toBytes(codePoints, offset, count); 350 if (val != null) { 351 this.coder = LATIN1; 352 this.value = val; 353 return; 354 } 355 } 356 this.coder = UTF16; 357 this.value = StringUTF16.toBytes(codePoints, offset, count); 358 } 359 360 /** 361 * Allocates a new {@code String} constructed from a subarray of an array 362 * of 8-bit integer values. 363 * 364 * <p> The {@code offset} argument is the index of the first byte of the 365 * subarray, and the {@code count} argument specifies the length of the 366 * subarray. 367 * 368 * <p> Each {@code byte} in the subarray is converted to a {@code char} as 369 * specified in the {@link #String(byte[],int) String(byte[],int)} constructor. 370 * 371 * @deprecated This method does not properly convert bytes into characters. 
372 * As of JDK 1.1, the preferred way to do this is via the 373 * {@code String} constructors that take a {@link Charset}, charset name, 374 * or that use the {@link Charset#defaultCharset() default charset}. 375 * 376 * @param ascii 377 * The bytes to be converted to characters 378 * 379 * @param hibyte 380 * The top 8 bits of each 16-bit Unicode code unit 381 * 382 * @param offset 383 * The initial offset 384 * @param count 385 * The length 386 * 387 * @throws IndexOutOfBoundsException 388 * If {@code offset} is negative, {@code count} is negative, or 389 * {@code offset} is greater than {@code ascii.length - count} 390 * 391 * @see #String(byte[], int) 392 * @see #String(byte[], int, int, java.lang.String) 393 * @see #String(byte[], int, int, java.nio.charset.Charset) 394 * @see #String(byte[], int, int) 395 * @see #String(byte[], java.lang.String) 396 * @see #String(byte[], java.nio.charset.Charset) 397 * @see #String(byte[]) 398 */ 399 @Deprecated(since="1.1") 400 public String(byte[] ascii, int hibyte, int offset, int count) { 401 checkBoundsOffCount(offset, count, ascii.length); 402 if (count == 0) { 403 this.value = "".value; 404 this.coder = "".coder; 405 return; 406 } 407 if (COMPACT_STRINGS && (byte)hibyte == 0) { 408 this.value = Arrays.copyOfRange(ascii, offset, offset + count); 409 this.coder = LATIN1; 410 } else { 411 hibyte <<= 8; 412 byte[] val = StringUTF16.newBytesFor(count); 413 for (int i = 0; i < count; i++) { 414 StringUTF16.putChar(val, i, hibyte | (ascii[offset++] & 0xff)); 415 } 416 this.value = val; 417 this.coder = UTF16; 418 } 419 } 420 421 /** 422 * Allocates a new {@code String} containing characters constructed from 423 * an array of 8-bit integer values. 
Each character <i>c</i> in the
     * resulting string is constructed from the corresponding component
     * <i>b</i> in the byte array such that:
     *
     * <blockquote><pre>
     *     <b><i>c</i></b> == (char)(((hibyte &amp; 0xff) &lt;&lt; 8)
     *                         | (<b><i>b</i></b> &amp; 0xff))
     * </pre></blockquote>
     *
     * @deprecated  This method does not properly convert bytes into
     * characters.  As of JDK 1.1, the preferred way to do this is via the
     * {@code String} constructors that take a {@link Charset}, charset name,
     * or that use the {@link Charset#defaultCharset() default charset}.
     *
     * @param  ascii
     *         The bytes to be converted to characters
     *
     * @param  hibyte
     *         The top 8 bits of each 16-bit Unicode code unit
     *
     * @see  #String(byte[], int, int, java.lang.String)
     * @see  #String(byte[], int, int, java.nio.charset.Charset)
     * @see  #String(byte[], int, int)
     * @see  #String(byte[], java.lang.String)
     * @see  #String(byte[], java.nio.charset.Charset)
     * @see  #String(byte[])
     */
    @Deprecated(since="1.1")
    public String(byte[] ascii, int hibyte) {
        // Whole-array convenience form of the (ascii, hibyte, offset, count)
        // constructor.
        this(ascii, hibyte, 0, ascii.length);
    }

    /**
     * Constructs a new {@code String} by decoding the specified subarray of
     * bytes using the specified charset.  The length of the new {@code String}
     * is a function of the charset, and hence may not be equal to the length
     * of the subarray.
     *
     * <p> The behavior of this constructor when the given bytes are not valid
     * in the given charset is unspecified.  The {@link
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
*
     * @param  bytes
     *         The bytes to be decoded into characters
     *
     * @param  offset
     *         The index of the first byte to decode
     *
     * @param  length
     *         The number of bytes to decode
     *
     * @param  charsetName
     *         The name of a supported {@linkplain java.nio.charset.Charset
     *         charset}
     *
     * @throws  UnsupportedEncodingException
     *          If the named charset is not supported
     *
     * @throws  IndexOutOfBoundsException
     *          If {@code offset} is negative, {@code length} is negative, or
     *          {@code offset} is greater than {@code bytes.length - length}
     *
     * @since  1.1
     */
    public String(byte[] bytes, int offset, int length, String charsetName)
            throws UnsupportedEncodingException {
        // Argument expressions run left to right: the charset name is resolved
        // first, then the range is checked. The value returned by
        // checkBoundsOffCount is what gets passed on as the offset.
        this(lookupCharset(charsetName), bytes, checkBoundsOffCount(offset, length, bytes.length), length);
    }

    /**
     * Constructs a new {@code String} by decoding the specified subarray of
     * bytes using the specified {@linkplain java.nio.charset.Charset charset}.
     * The length of the new {@code String} is a function of the charset, and
     * hence may not be equal to the length of the subarray.
     *
     * <p> This method always replaces malformed-input and unmappable-character
     * sequences with this charset's default replacement string.  The {@link
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
*
     * @param  bytes
     *         The bytes to be decoded into characters
     *
     * @param  offset
     *         The index of the first byte to decode
     *
     * @param  length
     *         The number of bytes to decode
     *
     * @param  charset
     *         The {@linkplain java.nio.charset.Charset charset} to be used to
     *         decode the {@code bytes}
     *
     * @throws  IndexOutOfBoundsException
     *          If {@code offset} is negative, {@code length} is negative, or
     *          {@code offset} is greater than {@code bytes.length - length}
     *
     * @since  1.6
     */
    public String(byte[] bytes, int offset, int length, Charset charset) {
        // Null check on the charset is evaluated before the range check.
        this(Objects.requireNonNull(charset), bytes, checkBoundsOffCount(offset, length, bytes.length), length);
    }

    /**
     * This method does not do any precondition checks on its arguments.
     * <p>
     * Important: parameter order of this method is deliberately changed in order to
     * disambiguate it against other similar methods of this class.
     * <p>
     * Dispatches on the charset: dedicated inline paths for UTF-8, ISO-8859-1
     * and US-ASCII, then {@code ArrayDecoder} fast paths, and finally a
     * generic {@code CharsetDecoder} with REPLACE actions for malformed and
     * unmappable input.
     */
    @SuppressWarnings("removal")
    private String(Charset charset, byte[] bytes, int offset, int length) {
        if (length == 0) {
            this.value = "".value;
            this.coder = "".coder;
        } else if (charset == UTF_8.INSTANCE) {
            if (COMPACT_STRINGS) {
                // dp doubles as the count of bytes already known to be
                // non-negative and as the write position into dst.
                int dp = StringCoding.countPositives(bytes, offset, length);
                if (dp == length) {
                    // Every byte is non-negative: the input can be stored unchanged.
                    this.value = Arrays.copyOfRange(bytes, offset, offset + length);
                    this.coder = LATIN1;
                    return;
                }
                int sl = offset + length;
                byte[] dst = new byte[length];
                if (dp > 0) {
                    System.arraycopy(bytes, offset, dst, 0, dp);
                    offset += dp;
                }
                // Keep decoding into one byte per char for as long as the
                // input is ASCII or a two-byte sequence led by 0xc2/0xc3.
                while (offset < sl) {
                    int b1 = bytes[offset++];
                    if (b1 >= 0) {
                        dst[dp++] = (byte)b1;
                        continue;
                    }
                    if ((b1 & 0xfe) == 0xc2 && offset < sl) { // b1 either 0xc2 or 0xc3
                        int b2 = bytes[offset];
                        if (b2 < -64) { // continuation bytes are always negative values in the range -128 to -65
                            dst[dp++] = (byte)decode2(b1, b2);
                            offset++;
                            continue;
                        }
                    }
                    // anything not a latin1, including the REPL
                    // we have to go with the utf16
                    offset--;   // step back so b1 is re-read by the UTF16 decoder below
                    break;
                }
                if (offset == sl) {
                    // Whole input consumed by the loop above.
                    if (dp != dst.length) {
                        dst = Arrays.copyOf(dst, dp);
                    }
                    this.value = dst;
                    this.coder = LATIN1;
                    return;
                }
                // Widen what has been decoded so far to two bytes per char and
                // continue from the current offset with the UTF16 decoder.
                byte[] buf = new byte[length << 1];
                StringLatin1.inflate(dst, 0, buf, 0, dp);
                dst = buf;
                dp = decodeUTF8_UTF16(bytes, offset, sl, dst, dp, true);
                if (dp != length) {
                    dst = Arrays.copyOf(dst, dp << 1);
                }
                this.value = dst;
                this.coder = UTF16;
            } else { // !COMPACT_STRINGS
                byte[] dst = new byte[length << 1];
                int dp = decodeUTF8_UTF16(bytes, offset, offset + length, dst, 0, true);
                if (dp != length) {
                    dst = Arrays.copyOf(dst, dp << 1);
                }
                this.value = dst;
                this.coder = UTF16;
            }
        } else if (charset == ISO_8859_1.INSTANCE) {
            if (COMPACT_STRINGS) {
                this.value = Arrays.copyOfRange(bytes, offset, offset + length);
                this.coder = LATIN1;
            } else {
                this.value = StringLatin1.inflate(bytes, offset, length);
                this.coder = UTF16;
            }
        } else if (charset == US_ASCII.INSTANCE) {
            if (COMPACT_STRINGS && !StringCoding.hasNegatives(bytes, offset, length)) {
                this.value = Arrays.copyOfRange(bytes, offset, offset + length);
                this.coder = LATIN1;
            } else {
                // Negative bytes are replaced with REPL, so the result is
                // built with two bytes per char.
                byte[] dst = new byte[length << 1];
                int dp = 0;
                while (dp < length) {
                    int b = bytes[offset++];
                    StringUTF16.putChar(dst, dp++, (b >= 0) ? (char) b : REPL);
                }
                this.value = dst;
                this.coder = UTF16;
            }
        } else {
            // (1)We never cache the "external" cs, the only benefit of creating
            // an additional StringDe/Encoder object to wrap it is to share the
            // de/encode() method. These SD/E objects are short-lived, the young-gen
            // gc should be able to take care of them well. But the best approach
            // is still not to generate them if not really necessary.
            // (2)The defensive copy of the input byte/char[] has a big performance
            // impact, as well as the outgoing result byte/char[]. Need to do the
            // optimization check of (sm==null && classLoader0==null) for both.
            CharsetDecoder cd = charset.newDecoder();
            // ArrayDecoder fastpaths
            if (cd instanceof ArrayDecoder ad) {
                // ascii
                if (ad.isASCIICompatible() && !StringCoding.hasNegatives(bytes, offset, length)) {
                    if (COMPACT_STRINGS) {
                        this.value = Arrays.copyOfRange(bytes, offset, offset + length);
                        this.coder = LATIN1;
                        return;
                    }
                    this.value = StringLatin1.inflate(bytes, offset, length);
                    this.coder = UTF16;
                    return;
                }

                // fastpath for always Latin1 decodable single byte
                if (COMPACT_STRINGS && ad.isLatin1Decodable()) {
                    byte[] dst = new byte[length];
                    ad.decodeToLatin1(bytes, offset, length, dst);
                    this.value = dst;
                    this.coder = LATIN1;
                    return;
                }

                int en = scale(length, cd.maxCharsPerByte());
                cd.onMalformedInput(CodingErrorAction.REPLACE)
                        .onUnmappableCharacter(CodingErrorAction.REPLACE);
                char[] ca = new char[en];
                int clen = ad.decode(bytes, offset, length, ca);
                if (COMPACT_STRINGS) {
                    // compress returning null means the chars go to UTF16 below.
                    byte[] bs = StringUTF16.compress(ca, 0, clen);
                    if (bs != null) {
                        value = bs;
                        coder = LATIN1;
                        return;
                    }
                }
                coder = UTF16;
                value = StringUTF16.toBytes(ca, 0, clen);
                return;
            }

            // decode using CharsetDecoder
            int en = scale(length, cd.maxCharsPerByte());
            cd.onMalformedInput(CodingErrorAction.REPLACE)
                    .onUnmappableCharacter(CodingErrorAction.REPLACE);
            char[] ca = new char[en];
            // Defensive copy of the input, made only when the charset class has
            // a non-null class loader AND a security manager is installed.
            if (charset.getClass().getClassLoader0() != null &&
                    System.getSecurityManager() != null) {
                bytes = Arrays.copyOfRange(bytes, offset, offset + length);
                offset = 0;
            }

            int caLen;
            try {
                caLen = decodeWithDecoder(cd, ca, bytes, offset, length);
            } catch (CharacterCodingException x) {
                // Substitution is enabled, so this shouldn't happen
                throw new Error(x);
            }
            if (COMPACT_STRINGS) {
                byte[] bs = StringUTF16.compress(ca, 0, caLen);
                if (bs != null) {
                    value = bs;
                    coder = LATIN1;
                    return;
                }
            }
            coder = UTF16;
            value = StringUTF16.toBytes(ca, 0, caLen);
        }
    }

    /*
     * Throws iae, instead of replacing, if malformed or unmappable.
     *
     * @param  noShare
     *         {@code true} if the resulting string MUST NOT share the byte array,
     *         {@code false} if the byte array can be exclusively used to construct
     *         the string and is not modified or used for any other purpose.
     */
    static String newStringUTF8NoRepl(byte[] bytes, int offset, int length, boolean noShare) {
        checkBoundsOffCount(offset, length, bytes.length);
        if (length == 0) {
            return "";
        }
        int dp;
        byte[] dst;
        if (COMPACT_STRINGS) {
            dp = StringCoding.countPositives(bytes, offset, length);
            int sl = offset + length;
            if (dp == length) {
                // All bytes non-negative. The caller's array is adopted directly
                // only when sharing is permitted and the range is the whole array.
                if (noShare || length != bytes.length) {
                    return new String(Arrays.copyOfRange(bytes, offset, offset + length), LATIN1);
                } else {
                    return new String(bytes, LATIN1);
                }
            }
            dst = new byte[length];
            System.arraycopy(bytes, offset, dst, 0, dp);
            offset += dp;
            while (offset < sl) {
                int b1 = bytes[offset++];
                if (b1 >= 0) {
                    dst[dp++] = (byte)b1;
                    continue;
                }
                if ((b1 & 0xfe) == 0xc2 && offset < sl) { // b1 either 0xc2 or 0xc3
                    int b2 = bytes[offset];
                    if (b2 < -64) { // continuation bytes are always negative values in the range -128 to -65
                        dst[dp++] = (byte)decode2(b1, b2);
                        offset++;
                        continue;
                    }
                }
                // anything not a latin1, including the REPL
                // we have to go with the utf16
                offset--;
                break;
            }
            if (offset == sl) {
                if (dp != dst.length) {
                    dst = Arrays.copyOf(dst, dp);
                }
                return new String(dst, LATIN1);
            }
            if (dp == 0) {
                // Nothing decoded yet, so there is nothing to inflate.
                dst = new byte[length << 1];
            } else {
byte[] buf = new byte[length << 1]; 758 StringLatin1.inflate(dst, 0, buf, 0, dp); 759 dst = buf; 760 } 761 dp = decodeUTF8_UTF16(bytes, offset, sl, dst, dp, false); 762 } else { // !COMPACT_STRINGS 763 dst = new byte[length << 1]; 764 dp = decodeUTF8_UTF16(bytes, offset, offset + length, dst, 0, false); 765 } 766 if (dp != length) { 767 dst = Arrays.copyOf(dst, dp << 1); 768 } 769 return new String(dst, UTF16); 770 } 771 772 static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingException { 773 try { 774 return newStringNoRepl1(src, cs); 775 } catch (IllegalArgumentException e) { 776 //newStringNoRepl1 throws IAE with MalformedInputException or CCE as the cause 777 Throwable cause = e.getCause(); 778 if (cause instanceof MalformedInputException mie) { 779 throw mie; 780 } 781 throw (CharacterCodingException)cause; 782 } 783 } 784 785 @SuppressWarnings("removal") 786 private static String newStringNoRepl1(byte[] src, Charset cs) { 787 int len = src.length; 788 if (len == 0) { 789 return ""; 790 } 791 if (cs == UTF_8.INSTANCE) { 792 return newStringUTF8NoRepl(src, 0, src.length, false); 793 } 794 if (cs == ISO_8859_1.INSTANCE) { 795 if (COMPACT_STRINGS) 796 return new String(src, LATIN1); 797 return new String(StringLatin1.inflate(src, 0, src.length), UTF16); 798 } 799 if (cs == US_ASCII.INSTANCE) { 800 if (!StringCoding.hasNegatives(src, 0, src.length)) { 801 if (COMPACT_STRINGS) 802 return new String(src, LATIN1); 803 return new String(StringLatin1.inflate(src, 0, src.length), UTF16); 804 } else { 805 throwMalformed(src); 806 } 807 } 808 809 CharsetDecoder cd = cs.newDecoder(); 810 // ascii fastpath 811 if (cd instanceof ArrayDecoder ad && 812 ad.isASCIICompatible() && 813 !StringCoding.hasNegatives(src, 0, src.length)) { 814 if (COMPACT_STRINGS) 815 return new String(src, LATIN1); 816 return new String(src, 0, src.length, ISO_8859_1.INSTANCE); 817 } 818 int en = scale(len, cd.maxCharsPerByte()); 819 char[] ca = new char[en]; 820 if 
(cs.getClass().getClassLoader0() != null && 821 System.getSecurityManager() != null) { 822 src = Arrays.copyOf(src, len); 823 } 824 int caLen; 825 try { 826 caLen = decodeWithDecoder(cd, ca, src, 0, src.length); 827 } catch (CharacterCodingException x) { 828 // throw via IAE 829 throw new IllegalArgumentException(x); 830 } 831 if (COMPACT_STRINGS) { 832 byte[] bs = StringUTF16.compress(ca, 0, caLen); 833 if (bs != null) { 834 return new String(bs, LATIN1); 835 } 836 } 837 return new String(StringUTF16.toBytes(ca, 0, caLen), UTF16); 838 } 839 840 private static final char REPL = '\ufffd'; 841 842 // Trim the given byte array to the given length 843 @SuppressWarnings("removal") 844 private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) { 845 if (len == ba.length && (isTrusted || System.getSecurityManager() == null)) { 846 return ba; 847 } else { 848 return Arrays.copyOf(ba, len); 849 } 850 } 851 852 private static int scale(int len, float expansionFactor) { 853 // We need to perform double, not float, arithmetic; otherwise 854 // we lose low order bits when len is larger than 2**24. 
855 return (int)(len * (double)expansionFactor); 856 } 857 858 private static Charset lookupCharset(String csn) throws UnsupportedEncodingException { 859 Objects.requireNonNull(csn); 860 try { 861 return Charset.forName(csn); 862 } catch (UnsupportedCharsetException | IllegalCharsetNameException x) { 863 throw new UnsupportedEncodingException(csn); 864 } 865 } 866 867 private static byte[] encode(Charset cs, byte coder, byte[] val) { 868 if (cs == UTF_8.INSTANCE) { 869 return encodeUTF8(coder, val, true); 870 } 871 if (cs == ISO_8859_1.INSTANCE) { 872 return encode8859_1(coder, val); 873 } 874 if (cs == US_ASCII.INSTANCE) { 875 return encodeASCII(coder, val); 876 } 877 return encodeWithEncoder(cs, coder, val, true); 878 } 879 880 private static byte[] encodeWithEncoder(Charset cs, byte coder, byte[] val, boolean doReplace) { 881 CharsetEncoder ce = cs.newEncoder(); 882 int len = val.length >> coder; // assume LATIN1=0/UTF16=1; 883 int en = scale(len, ce.maxBytesPerChar()); 884 // fastpath with ArrayEncoder implies `doReplace`. 885 if (doReplace && ce instanceof ArrayEncoder ae) { 886 // fastpath for ascii compatible 887 if (coder == LATIN1 && 888 ae.isASCIICompatible() && 889 !StringCoding.hasNegatives(val, 0, val.length)) { 890 return val.clone(); 891 } 892 byte[] ba = new byte[en]; 893 if (len == 0) { 894 return ba; 895 } 896 897 int blen = (coder == LATIN1) ? ae.encodeFromLatin1(val, 0, len, ba) 898 : ae.encodeFromUTF16(val, 0, len, ba); 899 if (blen != -1) { 900 return safeTrim(ba, blen, true); 901 } 902 } 903 904 byte[] ba = new byte[en]; 905 if (len == 0) { 906 return ba; 907 } 908 if (doReplace) { 909 ce.onMalformedInput(CodingErrorAction.REPLACE) 910 .onUnmappableCharacter(CodingErrorAction.REPLACE); 911 } 912 char[] ca = (coder == LATIN1 ) ? 
StringLatin1.toChars(val) 913 : StringUTF16.toChars(val); 914 ByteBuffer bb = ByteBuffer.wrap(ba); 915 CharBuffer cb = CharBuffer.wrap(ca, 0, len); 916 try { 917 CoderResult cr = ce.encode(cb, bb, true); 918 if (!cr.isUnderflow()) 919 cr.throwException(); 920 cr = ce.flush(bb); 921 if (!cr.isUnderflow()) 922 cr.throwException(); 923 } catch (CharacterCodingException x) { 924 if (!doReplace) { 925 throw new IllegalArgumentException(x); 926 } else { 927 throw new Error(x); 928 } 929 } 930 return safeTrim(ba, bb.position(), cs.getClass().getClassLoader0() == null); 931 } 932 933 /* 934 * Throws iae, instead of replacing, if unmappable. 935 */ 936 static byte[] getBytesUTF8NoRepl(String s) { 937 return encodeUTF8(s.coder(), s.value(), false); 938 } 939 940 private static boolean isASCII(byte[] src) { 941 return !StringCoding.hasNegatives(src, 0, src.length); 942 } 943 944 /* 945 * Throws CCE, instead of replacing, if unmappable. 946 */ 947 static byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException { 948 try { 949 return getBytesNoRepl1(s, cs); 950 } catch (IllegalArgumentException e) { 951 //getBytesNoRepl1 throws IAE with UnmappableCharacterException or CCE as the cause 952 Throwable cause = e.getCause(); 953 if (cause instanceof UnmappableCharacterException) { 954 throw (UnmappableCharacterException)cause; 955 } 956 throw (CharacterCodingException)cause; 957 } 958 } 959 960 private static byte[] getBytesNoRepl1(String s, Charset cs) { 961 byte[] val = s.value(); 962 byte coder = s.coder(); 963 if (cs == UTF_8.INSTANCE) { 964 if (coder == LATIN1 && isASCII(val)) { 965 return val; 966 } 967 return encodeUTF8(coder, val, false); 968 } 969 if (cs == ISO_8859_1.INSTANCE) { 970 if (coder == LATIN1) { 971 return val; 972 } 973 return encode8859_1(coder, val, false); 974 } 975 if (cs == US_ASCII.INSTANCE) { 976 if (coder == LATIN1) { 977 if (isASCII(val)) { 978 return val; 979 } else { 980 throwUnmappable(val); 981 } 982 } 983 } 984 return 
encodeWithEncoder(cs, coder, val, false); 985 } 986 987 private static byte[] encodeASCII(byte coder, byte[] val) { 988 if (coder == LATIN1) { 989 int positives = StringCoding.countPositives(val, 0, val.length); 990 byte[] dst = val.clone(); 991 if (positives < dst.length) { 992 replaceNegatives(dst, positives); 993 } 994 return dst; 995 } 996 int len = val.length >> 1; 997 byte[] dst = new byte[len]; 998 int dp = 0; 999 for (int i = 0; i < len; i++) { 1000 char c = StringUTF16.getChar(val, i); 1001 if (c < 0x80) { 1002 dst[dp++] = (byte)c; 1003 continue; 1004 } 1005 if (Character.isHighSurrogate(c) && i + 1 < len && 1006 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) { 1007 i++; 1008 } 1009 dst[dp++] = '?'; 1010 } 1011 if (len == dp) { 1012 return dst; 1013 } 1014 return Arrays.copyOf(dst, dp); 1015 } 1016 1017 private static void replaceNegatives(byte[] val, int fromIndex) { 1018 for (int i = fromIndex; i < val.length; i++) { 1019 if (val[i] < 0) { 1020 val[i] = '?'; 1021 } 1022 } 1023 } 1024 1025 private static byte[] encode8859_1(byte coder, byte[] val) { 1026 return encode8859_1(coder, val, true); 1027 } 1028 1029 private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) { 1030 if (coder == LATIN1) { 1031 return val.clone(); 1032 } 1033 int len = val.length >> 1; 1034 byte[] dst = new byte[len]; 1035 int dp = 0; 1036 int sp = 0; 1037 int sl = len; 1038 while (sp < sl) { 1039 int ret = StringCoding.implEncodeISOArray(val, sp, dst, dp, len); 1040 sp = sp + ret; 1041 dp = dp + ret; 1042 if (ret != len) { 1043 if (!doReplace) { 1044 throwUnmappable(sp); 1045 } 1046 char c = StringUTF16.getChar(val, sp++); 1047 if (Character.isHighSurrogate(c) && sp < sl && 1048 Character.isLowSurrogate(StringUTF16.getChar(val, sp))) { 1049 sp++; 1050 } 1051 dst[dp++] = '?'; 1052 len = sl - sp; 1053 } 1054 } 1055 if (dp == dst.length) { 1056 return dst; 1057 } 1058 return Arrays.copyOf(dst, dp); 1059 } 1060 1061 //////////////////////////////// utf8 
////////////////////////////////////

    /**
     * Decodes ASCII from the source byte array into the destination
     * char array. Used via JavaLangAccess from UTF_8 and other charset
     * decoders.
     *
     * @param sa  the source bytes
     * @param sp  index in {@code sa} of the first byte to examine
     * @param da  the destination chars
     * @param dp  index in {@code da} passed to the inflate call as the
     *            destination offset
     * @param len the maximum number of bytes to decode
     * @return the number of bytes successfully decoded, at most len
     */
    /* package-private */
    static int decodeASCII(byte[] sa, int sp, char[] da, int dp, int len) {
        int count = StringCoding.countPositives(sa, sp, len);
        // Top the count up one byte at a time until the first negative byte.
        // NOTE(review): presumably countPositives may report fewer than the
        // true number of leading non-negative bytes - confirm.
        while (count < len) {
            if (sa[sp + count] < 0) {
                break;
            }
            count++;
        }
        StringLatin1.inflate(sa, sp, da, dp, count);
        return count;
    }

    /** Returns true when {@code b} does not have the bit pattern 10xxxxxx. */
    private static boolean isNotContinuation(int b) {
        return (b & 0xc0) != 0x80;
    }

    /**
     * Checks a complete three-byte sequence: it is malformed when the lead
     * byte is 0xE0 and the second byte is in 0x80..0x9F, or when the second
     * or third byte is not of the form 10xxxxxx. {@code b1} is compared
     * against a sign-extended {@code (byte)} constant, so it must be a
     * sign-extended byte value.
     */
    private static boolean isMalformed3(int b1, int b2, int b3) {
        return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
               (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
    }

    /** Same test as {@code isMalformed3}, restricted to the first two bytes. */
    private static boolean isMalformed3_2(int b1, int b2) {
        return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
               (b2 & 0xc0) != 0x80;
    }

    /** Returns true when any of the three trailing bytes is not 10xxxxxx. */
    private static boolean isMalformed4(int b2, int b3, int b4) {
        return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
               (b4 & 0xc0) != 0x80;
    }

    /**
     * Checks the first two bytes of a four-byte sequence. Both arguments are
     * compared against unsigned constants; the caller in this file masks
     * them with {@code 0xff} first.
     */
    private static boolean isMalformed4_2(int b1, int b2) {
        return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
               (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
               (b2 & 0xc0) != 0x80;
    }

    /** Returns true when {@code b3} is not of the form 10xxxxxx. */
    private static boolean isMalformed4_3(int b3) {
        return (b3 & 0xc0) != 0x80;
    }

    /**
     * Combines a two-byte sequence into a char. The trailing XOR constant is
     * built from the same sign-extended 0xC0 / 0x80 marker bytes, so it
     * cancels the marker bits (and sign extension) of the shifted inputs.
     */
    private static char decode2(int b1, int b2) {
        return (char)(((b1 << 6) ^ b2) ^
                      (((byte) 0xC0 << 6) ^
                       ((byte) 0x80 << 0)));
    }

    /** Combines a three-byte sequence into a char; same XOR scheme as {@code decode2}. */
    private static char decode3(int b1, int b2, int b3) {
        return (char)((b1 << 12) ^
                      (b2 << 6) ^
                      (b3 ^
                       (((byte) 0xE0 << 12) ^
                        ((byte) 0x80 << 6) ^
                        ((byte) 0x80 << 0))));
    }

    /** Combines a four-byte sequence into an int code point; same XOR scheme as {@code decode2}. */
    private static int decode4(int b1, int b2, int b3, int b4) {
        return ((b1 << 18) ^
                (b2 << 12) ^
                (b3 << 6) ^
                (b4 ^
                 (((byte) 0xF0 << 18) ^
                  ((byte) 0x80 << 12) ^
                  ((byte) 0x80 << 6) ^
                  ((byte) 0x80 << 0))));
    }

    /**
     * Decodes UTF-8 bytes from {@code src[sp..sl)} and stores the resulting
     * chars into {@code dst} through {@code StringUTF16.putChar}, starting at
     * char index {@code dp}.
     *
     * @param src       the UTF-8 input
     * @param sp        index of the first input byte
     * @param sl        index one past the last input byte
     * @param dst       the output array, written via {@code StringUTF16.putChar}
     * @param dp        char index of the first output position
     * @param doReplace when {@code true} each malformed sequence produces
     *                  {@code REPL}; when {@code false} {@code throwMalformed}
     *                  is invoked instead
     * @return the char index following the last char written
     */
    private static int decodeUTF8_UTF16(byte[] src, int sp, int sl, byte[] dst, int dp, boolean doReplace) {
        while (sp < sl) {
            // b1 is sign-extended: bytes with the high bit set are negative.
            int b1 = src[sp++];
            if (b1 >= 0) {
                // Single byte 0x00..0x7F.
                StringUTF16.putChar(dst, dp++, (char) b1);
            } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
                // Lead byte 110xxxxx; the mask test excludes 0xC0 and 0xC1.
                if (sp < sl) {
                    int b2 = src[sp++];
                    if (isNotContinuation(b2)) {
                        if (!doReplace) {
                            throwMalformed(sp - 1, 1);
                        }
                        StringUTF16.putChar(dst, dp++, REPL);
                        // Re-examine b2 as the start of the next sequence.
                        sp--;
                    } else {
                        StringUTF16.putChar(dst, dp++, decode2(b1, b2));
                    }
                    continue;
                }
                // Input ended after the lead byte.
                if (!doReplace) {
                    throwMalformed(sp, 1);  // underflow()
                }
                StringUTF16.putChar(dst, dp++, REPL);
                break;
            } else if ((b1 >> 4) == -2) {
                // Lead byte 1110xxxx.
                if (sp + 1 < sl) {
                    int b2 = src[sp++];
                    int b3 = src[sp++];
                    if (isMalformed3(b1, b2, b3)) {
                        if (!doReplace) {
                            throwMalformed(sp - 3, 3);
                        }
                        StringUTF16.putChar(dst, dp++, REPL);
                        // Rewind to the lead byte, then skip only the
                        // malformed prefix reported by malformed3.
                        sp -= 3;
                        sp += malformed3(src, sp);
                    } else {
                        char c = decode3(b1, b2, b3);
                        // An encoded surrogate is rejected as malformed.
                        if (Character.isSurrogate(c)) {
                            if (!doReplace) {
                                throwMalformed(sp - 3, 3);
                            }
                            StringUTF16.putChar(dst, dp++, REPL);
                        } else {
                            StringUTF16.putChar(dst, dp++, c);
                        }
                    }
                    continue;
                }
                // Fewer than two bytes follow the lead byte.
                if (sp < sl && isMalformed3_2(b1, src[sp])) {
                    if (!doReplace) {
                        throwMalformed(sp - 1, 2);
                    }
                    StringUTF16.putChar(dst, dp++, REPL);
                    continue;
                }
                if (!doReplace) {
                    throwMalformed(sp, 1);
                }
                StringUTF16.putChar(dst, dp++, REPL);
                break;
            } else if ((b1 >> 3) == -2) {
                // Lead byte 11110xxx.
                if (sp + 2 < sl) {
                    int b2 = src[sp++];
                    int b3 = src[sp++];
                    int b4 = src[sp++];
                    int uc = decode4(b1, b2, b3, b4);
                    if (isMalformed4(b2, b3, b4) ||
                        !Character.isSupplementaryCodePoint(uc)) { // shortest form check
                        if (!doReplace) {
                            throwMalformed(sp - 4, 4);
                        }
                        StringUTF16.putChar(dst, dp++, REPL);
                        // Rewind to the lead byte, then skip only the
                        // malformed prefix reported by malformed4.
                        sp -= 4;
                        sp += malformed4(src, sp);
                    } else {
                        StringUTF16.putChar(dst, dp++, Character.highSurrogate(uc));
                        StringUTF16.putChar(dst, dp++, Character.lowSurrogate(uc));
                    }
                    continue;
                }
                // Fewer than three bytes follow the lead byte; switch to the
                // unsigned value expected by isMalformed4_2.
                b1 &= 0xff;
                if (b1 > 0xf4 || sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
                    if (!doReplace) {
                        throwMalformed(sp - 1, 1);  // or 2
                    }
                    StringUTF16.putChar(dst, dp++, REPL);
                    continue;
                }
                if (!doReplace) {
                    throwMalformed(sp - 1, 1);
                }
                sp++;
                StringUTF16.putChar(dst, dp++, REPL);
                if (sp < sl && isMalformed4_3(src[sp])) {
                    continue;
                }
                break;
            } else {
                // Any other byte with the high bit set cannot start a sequence.
                if (!doReplace) {
                    throwMalformed(sp - 1, 1);
                }
                StringUTF16.putChar(dst, dp++, REPL);
            }
        }
        return dp;
    }

    /**
     * Runs {@code cd} over {@code src[offset..offset+length)} with
     * end-of-input set, flushes it, and returns the resulting position of
     * the char buffer wrapped around {@code dst}.
     *
     * @throws CharacterCodingException if decoding or flushing ends with a
     *         result other than underflow
     */
    private static int decodeWithDecoder(CharsetDecoder cd, char[] dst, byte[] src, int offset, int length)
                                            throws CharacterCodingException {
        ByteBuffer bb = ByteBuffer.wrap(src, offset, length);
        CharBuffer cb = CharBuffer.wrap(dst, 0, dst.length);
        CoderResult cr = cd.decode(bb, cb, true);
        if (!cr.isUnderflow())
            cr.throwException();
        cr = cd.flush(cb);
        if (!cr.isUnderflow())
            cr.throwException();
        return cb.position();
    }

    /**
     * Inspects the first two bytes at {@code src[sp]} of a three-byte
     * sequence already known to be malformed and yields the number of
     * bytes to skip.
     */
    private static int malformed3(byte[] src, int sp) {
        int b1 = src[sp++];
        int b2 = src[sp];    // no need to lookup b3
        return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
                isNotContinuation(b2)) ?
1 : 2; 1263 } 1264 1265 private static int malformed4(byte[] src, int sp) { 1266 // we don't care the speed here 1267 int b1 = src[sp++] & 0xff; 1268 int b2 = src[sp++] & 0xff; 1269 if (b1 > 0xf4 || 1270 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || 1271 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || 1272 isNotContinuation(b2)) 1273 return 1; 1274 if (isNotContinuation(src[sp])) 1275 return 2; 1276 return 3; 1277 } 1278 1279 private static void throwMalformed(int off, int nb) { 1280 String msg = "malformed input off : " + off + ", length : " + nb; 1281 throw new IllegalArgumentException(msg, new MalformedInputException(nb)); 1282 } 1283 1284 private static void throwMalformed(byte[] val) { 1285 int dp = StringCoding.countPositives(val, 0, val.length); 1286 throwMalformed(dp, 1); 1287 } 1288 1289 private static void throwUnmappable(int off) { 1290 String msg = "malformed input off : " + off + ", length : 1"; 1291 throw new IllegalArgumentException(msg, new UnmappableCharacterException(1)); 1292 } 1293 1294 private static void throwUnmappable(byte[] val) { 1295 int dp = StringCoding.countPositives(val, 0, val.length); 1296 throwUnmappable(dp); 1297 } 1298 1299 private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { 1300 if (coder == UTF16) { 1301 return encodeUTF8_UTF16(val, doReplace); 1302 } 1303 1304 if (!StringCoding.hasNegatives(val, 0, val.length)) { 1305 return val.clone(); 1306 } 1307 1308 int dp = 0; 1309 byte[] dst = new byte[val.length << 1]; 1310 for (byte c : val) { 1311 if (c < 0) { 1312 dst[dp++] = (byte) (0xc0 | ((c & 0xff) >> 6)); 1313 dst[dp++] = (byte) (0x80 | (c & 0x3f)); 1314 } else { 1315 dst[dp++] = c; 1316 } 1317 } 1318 if (dp == dst.length) { 1319 return dst; 1320 } 1321 return Arrays.copyOf(dst, dp); 1322 } 1323 1324 private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { 1325 int dp = 0; 1326 int sp = 0; 1327 int sl = val.length >> 1; 1328 byte[] dst = new byte[sl * 3]; 1329 while (sp < sl) { 1330 // ascii fast 
loop; 1331 char c = StringUTF16.getChar(val, sp); 1332 if (c >= '\u0080') { 1333 break; 1334 } 1335 dst[dp++] = (byte)c; 1336 sp++; 1337 } 1338 while (sp < sl) { 1339 char c = StringUTF16.getChar(val, sp++); 1340 if (c < 0x80) { 1341 dst[dp++] = (byte)c; 1342 } else if (c < 0x800) { 1343 dst[dp++] = (byte)(0xc0 | (c >> 6)); 1344 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 1345 } else if (Character.isSurrogate(c)) { 1346 int uc = -1; 1347 char c2; 1348 if (Character.isHighSurrogate(c) && sp < sl && 1349 Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { 1350 uc = Character.toCodePoint(c, c2); 1351 } 1352 if (uc < 0) { 1353 if (doReplace) { 1354 dst[dp++] = '?'; 1355 } else { 1356 throwUnmappable(sp - 1); 1357 } 1358 } else { 1359 dst[dp++] = (byte)(0xf0 | ((uc >> 18))); 1360 dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); 1361 dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); 1362 dst[dp++] = (byte)(0x80 | (uc & 0x3f)); 1363 sp++; // 2 chars 1364 } 1365 } else { 1366 // 3 bytes, 16 bits 1367 dst[dp++] = (byte)(0xe0 | ((c >> 12))); 1368 dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); 1369 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 1370 } 1371 } 1372 if (dp == dst.length) { 1373 return dst; 1374 } 1375 return Arrays.copyOf(dst, dp); 1376 } 1377 1378 /** 1379 * Constructs a new {@code String} by decoding the specified array of bytes 1380 * using the specified {@linkplain java.nio.charset.Charset charset}. The 1381 * length of the new {@code String} is a function of the charset, and hence 1382 * may not be equal to the length of the byte array. 1383 * 1384 * <p> The behavior of this constructor when the given bytes are not valid 1385 * in the given charset is unspecified. The {@link 1386 * java.nio.charset.CharsetDecoder} class should be used when more control 1387 * over the decoding process is required. 
*
     * @param  bytes
     *         The bytes to be decoded into characters
     *
     * @param  charsetName
     *         The name of a supported {@linkplain java.nio.charset.Charset
     *         charset}
     *
     * @throws  UnsupportedEncodingException
     *          If the named charset is not supported
     *
     * @since  1.1
     */
    public String(byte[] bytes, String charsetName)
            throws UnsupportedEncodingException {
        // Resolve the name first, then decode the whole array (offset 0,
        // length bytes.length).
        // NOTE(review): lookupCharset is presumably what raises
        // UnsupportedEncodingException for an unknown name - confirm.
        this(lookupCharset(charsetName), bytes, 0, bytes.length);
    }

    /**
     * Constructs a new {@code String} by decoding the specified array of
     * bytes using the specified {@linkplain java.nio.charset.Charset charset}.
     * The length of the new {@code String} is a function of the charset, and
     * hence may not be equal to the length of the byte array.
     *
     * <p> This method always replaces malformed-input and unmappable-character
     * sequences with this charset's default replacement string.  The {@link
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
     *
     * @param  bytes
     *         The bytes to be decoded into characters
     *
     * @param  charset
     *         The {@linkplain java.nio.charset.Charset charset} to be used to
     *         decode the {@code bytes}
     *
     * @since  1.6
     */
    public String(byte[] bytes, Charset charset) {
        // The null check on charset is the first argument evaluated, so a
        // null charset fails with NullPointerException before bytes is read.
        this(Objects.requireNonNull(charset), bytes, 0, bytes.length);
    }

    /**
     * Constructs a new {@code String} by decoding the specified subarray of
     * bytes using the {@link Charset#defaultCharset() default charset}.
     * The length of the new {@code String} is a function of the charset,
     * and hence may not be equal to the length of the subarray.
     *
     * <p> The behavior of this constructor when the given bytes are not valid
     * in the default charset is unspecified.
* The {@link
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
     *
     * @param  bytes
     *         The bytes to be decoded into characters
     *
     * @param  offset
     *         The index of the first byte to decode
     *
     * @param  length
     *         The number of bytes to decode
     *
     * @throws  IndexOutOfBoundsException
     *          If {@code offset} is negative, {@code length} is negative, or
     *          {@code offset} is greater than {@code bytes.length - length}
     *
     * @since  1.1
     */
    public String(byte[] bytes, int offset, int length) {
        // The bounds check runs while the arguments are being evaluated, i.e.
        // before the delegated constructor starts decoding.
        // NOTE(review): checkBoundsOffCount presumably returns the validated
        // offset - confirm against its definition.
        this(Charset.defaultCharset(), bytes, checkBoundsOffCount(offset, length, bytes.length), length);
    }

    /**
     * Constructs a new {@code String} by decoding the specified array of bytes
     * using the {@link Charset#defaultCharset() default charset}.  The length
     * of the new {@code String} is a function of the charset, and hence may not
     * be equal to the length of the byte array.
     *
     * <p> The behavior of this constructor when the given bytes are not valid
     * in the default charset is unspecified.  The {@link
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
     *
     * @param  bytes
     *         The bytes to be decoded into characters
     *
     * @since  1.1
     */
    public String(byte[] bytes) {
        // Decodes the whole array: offset 0, length bytes.length.
        this(Charset.defaultCharset(), bytes, 0, bytes.length);
    }

    /**
     * Allocates a new string that contains the sequence of characters
     * currently contained in the string buffer argument. The contents of the
     * string buffer are copied; subsequent modification of the string buffer
     * does not affect the newly created string.
*
     * @param  buffer
     *         A {@code StringBuffer}
     */
    public String(StringBuffer buffer) {
        // Delegates to the String-taking constructor with the buffer's
        // toString() result.
        this(buffer.toString());
    }

    /**
     * Allocates a new string that contains the sequence of characters
     * currently contained in the string builder argument. The contents of the
     * string builder are copied; subsequent modification of the string builder
     * does not affect the newly created string.
     *
     * <p> This constructor is provided to ease migration to {@code
     * StringBuilder}. Obtaining a string from a string builder via the {@code
     * toString} method is likely to run faster and is generally preferred.
     *
     * @param   builder
     *          A {@code StringBuilder}
     *
     * @since  1.5
     */
    public String(StringBuilder builder) {
        // NOTE(review): the null second argument presumably only selects the
        // (AbstractStringBuilder, Void) overload - confirm.
        this(builder, null);
    }

    /**
     * Returns the length of this string.
     * The length is equal to the number of <a href="Character.html#unicode">Unicode
     * code units</a> in the string.
     *
     * @return  the length of the sequence of characters represented by this
     *          object.
     */
    public int length() {
        // Byte count shifted right by the coder.
        // NOTE(review): presumably coder() is 0 for LATIN1 (one byte per
        // char) and 1 for UTF16 (two bytes per char) - confirm.
        return value.length >> coder();
    }

    /**
     * Returns {@code true} if, and only if, {@link #length()} is {@code 0}.
     *
     * @return {@code true} if {@link #length()} is {@code 0}, otherwise
     * {@code false}
     *
     * @since 1.6
     */
    @Override
    public boolean isEmpty() {
        // Checks the backing array directly instead of calling length().
        return value.length == 0;
    }

    /**
     * Returns the {@code char} value at the
     * specified index. An index ranges from {@code 0} to
     * {@code length() - 1}. The first {@code char} value of the sequence
     * is at index {@code 0}, the next at index {@code 1},
     * and so on, as for array indexing.
     *
     * <p>If the {@code char} value specified by the index is a
     * <a href="Character.html#unicode">surrogate</a>, the surrogate
     * value is returned.
1547 * 1548 * @param index the index of the {@code char} value. 1549 * @return the {@code char} value at the specified index of this string. 1550 * The first {@code char} value is at index {@code 0}. 1551 * @throws IndexOutOfBoundsException if the {@code index} 1552 * argument is negative or not less than the length of this 1553 * string. 1554 */ 1555 public char charAt(int index) { 1556 if (isLatin1()) { 1557 return StringLatin1.charAt(value, index); 1558 } else { 1559 return StringUTF16.charAt(value, index); 1560 } 1561 } 1562 1563 /** 1564 * Returns the character (Unicode code point) at the specified 1565 * index. The index refers to {@code char} values 1566 * (Unicode code units) and ranges from {@code 0} to 1567 * {@link #length()}{@code - 1}. 1568 * 1569 * <p> If the {@code char} value specified at the given index 1570 * is in the high-surrogate range, the following index is less 1571 * than the length of this {@code String}, and the 1572 * {@code char} value at the following index is in the 1573 * low-surrogate range, then the supplementary code point 1574 * corresponding to this surrogate pair is returned. Otherwise, 1575 * the {@code char} value at the given index is returned. 1576 * 1577 * @param index the index to the {@code char} values 1578 * @return the code point value of the character at the 1579 * {@code index} 1580 * @throws IndexOutOfBoundsException if the {@code index} 1581 * argument is negative or not less than the length of this 1582 * string. 1583 * @since 1.5 1584 */ 1585 public int codePointAt(int index) { 1586 if (isLatin1()) { 1587 checkIndex(index, value.length); 1588 return value[index] & 0xff; 1589 } 1590 int length = value.length >> 1; 1591 checkIndex(index, length); 1592 return StringUTF16.codePointAt(value, index, length); 1593 } 1594 1595 /** 1596 * Returns the character (Unicode code point) before the specified 1597 * index. 
The index refers to {@code char} values 1598 * (Unicode code units) and ranges from {@code 1} to {@link 1599 * CharSequence#length() length}. 1600 * 1601 * <p> If the {@code char} value at {@code (index - 1)} 1602 * is in the low-surrogate range, {@code (index - 2)} is not 1603 * negative, and the {@code char} value at {@code (index - 1604 * 2)} is in the high-surrogate range, then the 1605 * supplementary code point value of the surrogate pair is 1606 * returned. If the {@code char} value at {@code index - 1607 * 1} is an unpaired low-surrogate or a high-surrogate, the 1608 * surrogate value is returned. 1609 * 1610 * @param index the index following the code point that should be returned 1611 * @return the Unicode code point value before the given index. 1612 * @throws IndexOutOfBoundsException if the {@code index} 1613 * argument is less than 1 or greater than the length 1614 * of this string. 1615 * @since 1.5 1616 */ 1617 public int codePointBefore(int index) { 1618 int i = index - 1; 1619 checkIndex(i, length()); 1620 if (isLatin1()) { 1621 return (value[i] & 0xff); 1622 } 1623 return StringUTF16.codePointBefore(value, index); 1624 } 1625 1626 /** 1627 * Returns the number of Unicode code points in the specified text 1628 * range of this {@code String}. The text range begins at the 1629 * specified {@code beginIndex} and extends to the 1630 * {@code char} at index {@code endIndex - 1}. Thus the 1631 * length (in {@code char}s) of the text range is 1632 * {@code endIndex-beginIndex}. Unpaired surrogates within 1633 * the text range count as one code point each. 1634 * 1635 * @param beginIndex the index to the first {@code char} of 1636 * the text range. 1637 * @param endIndex the index after the last {@code char} of 1638 * the text range. 
1639 * @return the number of Unicode code points in the specified text 1640 * range 1641 * @throws IndexOutOfBoundsException if the 1642 * {@code beginIndex} is negative, or {@code endIndex} 1643 * is larger than the length of this {@code String}, or 1644 * {@code beginIndex} is larger than {@code endIndex}. 1645 * @since 1.5 1646 */ 1647 public int codePointCount(int beginIndex, int endIndex) { 1648 Objects.checkFromToIndex(beginIndex, endIndex, length()); 1649 if (isLatin1()) { 1650 return endIndex - beginIndex; 1651 } 1652 return StringUTF16.codePointCount(value, beginIndex, endIndex); 1653 } 1654 1655 /** 1656 * Returns the index within this {@code String} that is 1657 * offset from the given {@code index} by 1658 * {@code codePointOffset} code points. Unpaired surrogates 1659 * within the text range given by {@code index} and 1660 * {@code codePointOffset} count as one code point each. 1661 * 1662 * @param index the index to be offset 1663 * @param codePointOffset the offset in code points 1664 * @return the index within this {@code String} 1665 * @throws IndexOutOfBoundsException if {@code index} 1666 * is negative or larger than the length of this 1667 * {@code String}, or if {@code codePointOffset} is positive 1668 * and the substring starting with {@code index} has fewer 1669 * than {@code codePointOffset} code points, 1670 * or if {@code codePointOffset} is negative and the substring 1671 * before {@code index} has fewer than the absolute value 1672 * of {@code codePointOffset} code points. 1673 * @since 1.5 1674 */ 1675 public int offsetByCodePoints(int index, int codePointOffset) { 1676 return Character.offsetByCodePoints(this, index, codePointOffset); 1677 } 1678 1679 /** 1680 * Copies characters from this string into the destination character 1681 * array. 
1682 * <p> 1683 * The first character to be copied is at index {@code srcBegin}; 1684 * the last character to be copied is at index {@code srcEnd-1} 1685 * (thus the total number of characters to be copied is 1686 * {@code srcEnd-srcBegin}). The characters are copied into the 1687 * subarray of {@code dst} starting at index {@code dstBegin} 1688 * and ending at index: 1689 * <blockquote><pre> 1690 * dstBegin + (srcEnd-srcBegin) - 1 1691 * </pre></blockquote> 1692 * 1693 * @param srcBegin index of the first character in the string 1694 * to copy. 1695 * @param srcEnd index after the last character in the string 1696 * to copy. 1697 * @param dst the destination array. 1698 * @param dstBegin the start offset in the destination array. 1699 * @throws IndexOutOfBoundsException If any of the following 1700 * is true: 1701 * <ul><li>{@code srcBegin} is negative. 1702 * <li>{@code srcBegin} is greater than {@code srcEnd} 1703 * <li>{@code srcEnd} is greater than the length of this 1704 * string 1705 * <li>{@code dstBegin} is negative 1706 * <li>{@code dstBegin+(srcEnd-srcBegin)} is larger than 1707 * {@code dst.length}</ul> 1708 */ 1709 public void getChars(int srcBegin, int srcEnd, char[] dst, int dstBegin) { 1710 checkBoundsBeginEnd(srcBegin, srcEnd, length()); 1711 checkBoundsOffCount(dstBegin, srcEnd - srcBegin, dst.length); 1712 if (isLatin1()) { 1713 StringLatin1.getChars(value, srcBegin, srcEnd, dst, dstBegin); 1714 } else { 1715 StringUTF16.getChars(value, srcBegin, srcEnd, dst, dstBegin); 1716 } 1717 } 1718 1719 /** 1720 * Copies characters from this string into the destination byte array. Each 1721 * byte receives the 8 low-order bits of the corresponding character. The 1722 * eight high-order bits of each character are not copied and do not 1723 * participate in the transfer in any way. 1724 * 1725 * <p> The first character to be copied is at index {@code srcBegin}; the 1726 * last character to be copied is at index {@code srcEnd-1}. 
The total 1727 * number of characters to be copied is {@code srcEnd-srcBegin}. The 1728 * characters, converted to bytes, are copied into the subarray of {@code 1729 * dst} starting at index {@code dstBegin} and ending at index: 1730 * 1731 * <blockquote><pre> 1732 * dstBegin + (srcEnd-srcBegin) - 1 1733 * </pre></blockquote> 1734 * 1735 * @deprecated This method does not properly convert characters into 1736 * bytes. As of JDK 1.1, the preferred way to do this is via the 1737 * {@link #getBytes()} method, which uses the {@link Charset#defaultCharset() 1738 * default charset}. 1739 * 1740 * @param srcBegin 1741 * Index of the first character in the string to copy 1742 * 1743 * @param srcEnd 1744 * Index after the last character in the string to copy 1745 * 1746 * @param dst 1747 * The destination array 1748 * 1749 * @param dstBegin 1750 * The start offset in the destination array 1751 * 1752 * @throws IndexOutOfBoundsException 1753 * If any of the following is true: 1754 * <ul> 1755 * <li> {@code srcBegin} is negative 1756 * <li> {@code srcBegin} is greater than {@code srcEnd} 1757 * <li> {@code srcEnd} is greater than the length of this String 1758 * <li> {@code dstBegin} is negative 1759 * <li> {@code dstBegin+(srcEnd-srcBegin)} is larger than {@code 1760 * dst.length} 1761 * </ul> 1762 */ 1763 @Deprecated(since="1.1") 1764 public void getBytes(int srcBegin, int srcEnd, byte[] dst, int dstBegin) { 1765 checkBoundsBeginEnd(srcBegin, srcEnd, length()); 1766 Objects.requireNonNull(dst); 1767 checkBoundsOffCount(dstBegin, srcEnd - srcBegin, dst.length); 1768 if (isLatin1()) { 1769 StringLatin1.getBytes(value, srcBegin, srcEnd, dst, dstBegin); 1770 } else { 1771 StringUTF16.getBytes(value, srcBegin, srcEnd, dst, dstBegin); 1772 } 1773 } 1774 1775 /** 1776 * Encodes this {@code String} into a sequence of bytes using the named 1777 * charset, storing the result into a new byte array. 
1778 * 1779 * <p> The behavior of this method when this string cannot be encoded in 1780 * the given charset is unspecified. The {@link 1781 * java.nio.charset.CharsetEncoder} class should be used when more control 1782 * over the encoding process is required. 1783 * 1784 * @param charsetName 1785 * The name of a supported {@linkplain java.nio.charset.Charset 1786 * charset} 1787 * 1788 * @return The resultant byte array 1789 * 1790 * @throws UnsupportedEncodingException 1791 * If the named charset is not supported 1792 * 1793 * @since 1.1 1794 */ 1795 public byte[] getBytes(String charsetName) 1796 throws UnsupportedEncodingException { 1797 return encode(lookupCharset(charsetName), coder(), value); 1798 } 1799 1800 /** 1801 * Encodes this {@code String} into a sequence of bytes using the given 1802 * {@linkplain java.nio.charset.Charset charset}, storing the result into a 1803 * new byte array. 1804 * 1805 * <p> This method always replaces malformed-input and unmappable-character 1806 * sequences with this charset's default replacement byte array. The 1807 * {@link java.nio.charset.CharsetEncoder} class should be used when more 1808 * control over the encoding process is required. 1809 * 1810 * @param charset 1811 * The {@linkplain java.nio.charset.Charset} to be used to encode 1812 * the {@code String} 1813 * 1814 * @return The resultant byte array 1815 * 1816 * @since 1.6 1817 */ 1818 public byte[] getBytes(Charset charset) { 1819 if (charset == null) throw new NullPointerException(); 1820 return encode(charset, coder(), value); 1821 } 1822 1823 /** 1824 * Encodes this {@code String} into a sequence of bytes using the 1825 * {@link Charset#defaultCharset() default charset}, storing the result 1826 * into a new byte array. 1827 * 1828 * <p> The behavior of this method when this string cannot be encoded in 1829 * the default charset is unspecified. 
The {@link 1830 * java.nio.charset.CharsetEncoder} class should be used when more control 1831 * over the encoding process is required. 1832 * 1833 * @return The resultant byte array 1834 * 1835 * @since 1.1 1836 */ 1837 public byte[] getBytes() { 1838 return encode(Charset.defaultCharset(), coder(), value); 1839 } 1840 1841 boolean bytesCompatible(Charset charset) { 1842 if (isLatin1()) { 1843 if (charset == ISO_8859_1.INSTANCE) { 1844 return true; // ok, same encoding 1845 } else if (charset == UTF_8.INSTANCE || charset == US_ASCII.INSTANCE) { 1846 return !StringCoding.hasNegatives(value, 0, value.length); // ok, if ASCII-compatible 1847 } 1848 } 1849 return false; 1850 } 1851 1852 void copyToSegmentRaw(MemorySegment segment, long offset) { 1853 MemorySegment.copy(value, 0, segment, ValueLayout.JAVA_BYTE, offset, value.length); 1854 } 1855 1856 /** 1857 * Compares this string to the specified object. The result is {@code 1858 * true} if and only if the argument is not {@code null} and is a {@code 1859 * String} object that represents the same sequence of characters as this 1860 * object. 1861 * 1862 * <p>For finer-grained String comparison, refer to 1863 * {@link java.text.Collator}. 1864 * 1865 * @param anObject 1866 * The object to compare this {@code String} against 1867 * 1868 * @return {@code true} if the given object represents a {@code String} 1869 * equivalent to this string, {@code false} otherwise 1870 * 1871 * @see #compareTo(String) 1872 * @see #equalsIgnoreCase(String) 1873 */ 1874 public boolean equals(Object anObject) { 1875 if (this == anObject) { 1876 return true; 1877 } 1878 return (anObject instanceof String aString) 1879 && (!COMPACT_STRINGS || this.coder == aString.coder) 1880 && StringLatin1.equals(value, aString.value); 1881 } 1882 1883 /** 1884 * Compares this string to the specified {@code StringBuffer}. 
The result 1885 * is {@code true} if and only if this {@code String} represents the same 1886 * sequence of characters as the specified {@code StringBuffer}. This method 1887 * synchronizes on the {@code StringBuffer}. 1888 * 1889 * <p>For finer-grained String comparison, refer to 1890 * {@link java.text.Collator}. 1891 * 1892 * @param sb 1893 * The {@code StringBuffer} to compare this {@code String} against 1894 * 1895 * @return {@code true} if this {@code String} represents the same 1896 * sequence of characters as the specified {@code StringBuffer}, 1897 * {@code false} otherwise 1898 * 1899 * @since 1.4 1900 */ 1901 public boolean contentEquals(StringBuffer sb) { 1902 return contentEquals((CharSequence)sb); 1903 } 1904 1905 private boolean nonSyncContentEquals(AbstractStringBuilder sb) { 1906 int len = length(); 1907 if (len != sb.length()) { 1908 return false; 1909 } 1910 byte[] v1 = value; 1911 byte[] v2 = sb.getValue(); 1912 byte coder = coder(); 1913 if (coder == sb.getCoder()) { 1914 return v1.length <= v2.length && ArraysSupport.mismatch(v1, v2, v1.length) < 0; 1915 } else { 1916 if (coder != LATIN1) { // utf16 str and latin1 abs can never be "equal" 1917 return false; 1918 } 1919 return StringUTF16.contentEquals(v1, v2, len); 1920 } 1921 } 1922 1923 /** 1924 * Compares this string to the specified {@code CharSequence}. The 1925 * result is {@code true} if and only if this {@code String} represents the 1926 * same sequence of char values as the specified sequence. Note that if the 1927 * {@code CharSequence} is a {@code StringBuffer} then the method 1928 * synchronizes on it. 1929 * 1930 * <p>For finer-grained String comparison, refer to 1931 * {@link java.text.Collator}. 
1932 * 1933 * @param cs 1934 * The sequence to compare this {@code String} against 1935 * 1936 * @return {@code true} if this {@code String} represents the same 1937 * sequence of char values as the specified sequence, {@code 1938 * false} otherwise 1939 * 1940 * @since 1.5 1941 */ 1942 public boolean contentEquals(CharSequence cs) { 1943 // Argument is a StringBuffer, StringBuilder 1944 if (cs instanceof AbstractStringBuilder) { 1945 if (cs instanceof StringBuffer) { 1946 synchronized(cs) { 1947 return nonSyncContentEquals((AbstractStringBuilder)cs); 1948 } 1949 } else { 1950 return nonSyncContentEquals((AbstractStringBuilder)cs); 1951 } 1952 } 1953 // Argument is a String 1954 if (cs instanceof String) { 1955 return equals(cs); 1956 } 1957 // Argument is a generic CharSequence 1958 int n = cs.length(); 1959 if (n != length()) { 1960 return false; 1961 } 1962 byte[] val = this.value; 1963 if (isLatin1()) { 1964 for (int i = 0; i < n; i++) { 1965 if ((val[i] & 0xff) != cs.charAt(i)) { 1966 return false; 1967 } 1968 } 1969 } else { 1970 if (!StringUTF16.contentEquals(val, cs, n)) { 1971 return false; 1972 } 1973 } 1974 return true; 1975 } 1976 1977 /** 1978 * Compares this {@code String} to another {@code String}, ignoring case 1979 * considerations. Two strings are considered equal ignoring case if they 1980 * are of the same length and corresponding Unicode code points in the two 1981 * strings are equal ignoring case. 1982 * 1983 * <p> Two Unicode code points are considered the same 1984 * ignoring case if at least one of the following is true: 1985 * <ul> 1986 * <li> The two Unicode code points are the same (as compared by the 1987 * {@code ==} operator) 1988 * <li> Calling {@code Character.toLowerCase(Character.toUpperCase(int))} 1989 * on each Unicode code point produces the same result 1990 * </ul> 1991 * 1992 * <p>Note that this method does <em>not</em> take locale into account, and 1993 * will result in unsatisfactory results for certain locales. 
The 1994 * {@link java.text.Collator} class provides locale-sensitive comparison. 1995 * 1996 * @param anotherString 1997 * The {@code String} to compare this {@code String} against 1998 * 1999 * @return {@code true} if the argument is not {@code null} and it 2000 * represents an equivalent {@code String} ignoring case; {@code 2001 * false} otherwise 2002 * 2003 * @see #equals(Object) 2004 * @see #codePoints() 2005 */ 2006 public boolean equalsIgnoreCase(String anotherString) { 2007 return (this == anotherString) ? true 2008 : (anotherString != null) 2009 && (anotherString.length() == length()) 2010 && regionMatches(true, 0, anotherString, 0, length()); 2011 } 2012 2013 /** 2014 * Compares two strings lexicographically. 2015 * The comparison is based on the Unicode value of each character in 2016 * the strings. The character sequence represented by this 2017 * {@code String} object is compared lexicographically to the 2018 * character sequence represented by the argument string. The result is 2019 * a negative integer if this {@code String} object 2020 * lexicographically precedes the argument string. The result is a 2021 * positive integer if this {@code String} object lexicographically 2022 * follows the argument string. The result is zero if the strings 2023 * are equal; {@code compareTo} returns {@code 0} exactly when 2024 * the {@link #equals(Object)} method would return {@code true}. 2025 * <p> 2026 * This is the definition of lexicographic ordering. If two strings are 2027 * different, then either they have different characters at some index 2028 * that is a valid index for both strings, or their lengths are different, 2029 * or both. If they have different characters at one or more index 2030 * positions, let <i>k</i> be the smallest such index; then the string 2031 * whose character at position <i>k</i> has the smaller value, as 2032 * determined by using the {@code <} operator, lexicographically precedes the 2033 * other string. 
In this case, {@code compareTo} returns the 2034 * difference of the two character values at position {@code k} in 2035 * the two string -- that is, the value: 2036 * <blockquote><pre> 2037 * this.charAt(k)-anotherString.charAt(k) 2038 * </pre></blockquote> 2039 * If there is no index position at which they differ, then the shorter 2040 * string lexicographically precedes the longer string. In this case, 2041 * {@code compareTo} returns the difference of the lengths of the 2042 * strings -- that is, the value: 2043 * <blockquote><pre> 2044 * this.length()-anotherString.length() 2045 * </pre></blockquote> 2046 * 2047 * <p>For finer-grained String comparison, refer to 2048 * {@link java.text.Collator}. 2049 * 2050 * @param anotherString the {@code String} to be compared. 2051 * @return the value {@code 0} if the argument string is equal to 2052 * this string; a value less than {@code 0} if this string 2053 * is lexicographically less than the string argument; and a 2054 * value greater than {@code 0} if this string is 2055 * lexicographically greater than the string argument. 2056 */ 2057 public int compareTo(String anotherString) { 2058 byte[] v1 = value; 2059 byte[] v2 = anotherString.value; 2060 byte coder = coder(); 2061 if (coder == anotherString.coder()) { 2062 return coder == LATIN1 ? StringLatin1.compareTo(v1, v2) 2063 : StringUTF16.compareTo(v1, v2); 2064 } 2065 return coder == LATIN1 ? StringLatin1.compareToUTF16(v1, v2) 2066 : StringUTF16.compareToLatin1(v1, v2); 2067 } 2068 2069 /** 2070 * A Comparator that orders {@code String} objects as by 2071 * {@link #compareToIgnoreCase(String) compareToIgnoreCase}. 2072 * This comparator is serializable. 2073 * <p> 2074 * Note that this Comparator does <em>not</em> take locale into account, 2075 * and will result in an unsatisfactory ordering for certain locales. 2076 * The {@link java.text.Collator} class provides locale-sensitive comparison. 
2077 * 2078 * @see java.text.Collator 2079 * @since 1.2 2080 */ 2081 public static final Comparator<String> CASE_INSENSITIVE_ORDER 2082 = new CaseInsensitiveComparator(); 2083 2084 /** 2085 * CaseInsensitiveComparator for Strings. 2086 */ 2087 private static class CaseInsensitiveComparator 2088 implements Comparator<String>, java.io.Serializable { 2089 // use serialVersionUID from JDK 1.2.2 for interoperability 2090 @java.io.Serial 2091 private static final long serialVersionUID = 8575799808933029326L; 2092 2093 public int compare(String s1, String s2) { 2094 byte[] v1 = s1.value; 2095 byte[] v2 = s2.value; 2096 byte coder = s1.coder(); 2097 if (coder == s2.coder()) { 2098 return coder == LATIN1 ? StringLatin1.compareToCI(v1, v2) 2099 : StringUTF16.compareToCI(v1, v2); 2100 } 2101 return coder == LATIN1 ? StringLatin1.compareToCI_UTF16(v1, v2) 2102 : StringUTF16.compareToCI_Latin1(v1, v2); 2103 } 2104 2105 /** Replaces the de-serialized object. */ 2106 @java.io.Serial 2107 private Object readResolve() { return CASE_INSENSITIVE_ORDER; } 2108 } 2109 2110 /** 2111 * Compares two strings lexicographically, ignoring case 2112 * differences. This method returns an integer whose sign is that of 2113 * calling {@code compareTo} with case folded versions of the strings 2114 * where case differences have been eliminated by calling 2115 * {@code Character.toLowerCase(Character.toUpperCase(int))} on 2116 * each Unicode code point. 2117 * <p> 2118 * Note that this method does <em>not</em> take locale into account, 2119 * and will result in an unsatisfactory ordering for certain locales. 2120 * The {@link java.text.Collator} class provides locale-sensitive comparison. 2121 * 2122 * @param str the {@code String} to be compared. 2123 * @return a negative integer, zero, or a positive integer as the 2124 * specified String is greater than, equal to, or less 2125 * than this String, ignoring case considerations. 
2126 * @see java.text.Collator 2127 * @see #codePoints() 2128 * @since 1.2 2129 */ 2130 public int compareToIgnoreCase(String str) { 2131 return CASE_INSENSITIVE_ORDER.compare(this, str); 2132 } 2133 2134 /** 2135 * Tests if two string regions are equal. 2136 * <p> 2137 * A substring of this {@code String} object is compared to a substring 2138 * of the argument other. The result is true if these substrings 2139 * represent identical character sequences. The substring of this 2140 * {@code String} object to be compared begins at index {@code toffset} 2141 * and has length {@code len}. The substring of other to be compared 2142 * begins at index {@code ooffset} and has length {@code len}. The 2143 * result is {@code false} if and only if at least one of the following 2144 * is true: 2145 * <ul><li>{@code toffset} is negative. 2146 * <li>{@code ooffset} is negative. 2147 * <li>{@code toffset+len} is greater than the length of this 2148 * {@code String} object. 2149 * <li>{@code ooffset+len} is greater than the length of the other 2150 * argument. 2151 * <li>There is some nonnegative integer <i>k</i> less than {@code len} 2152 * such that: 2153 * {@code this.charAt(toffset + }<i>k</i>{@code ) != other.charAt(ooffset + } 2154 * <i>k</i>{@code )} 2155 * </ul> 2156 * 2157 * <p>Note that this method does <em>not</em> take locale into account. The 2158 * {@link java.text.Collator} class provides locale-sensitive comparison. 2159 * 2160 * @param toffset the starting offset of the subregion in this string. 2161 * @param other the string argument. 2162 * @param ooffset the starting offset of the subregion in the string 2163 * argument. 2164 * @param len the number of characters to compare. 2165 * @return {@code true} if the specified subregion of this string 2166 * exactly matches the specified subregion of the string argument; 2167 * {@code false} otherwise. 
2168 */ 2169 public boolean regionMatches(int toffset, String other, int ooffset, int len) { 2170 // Note: toffset, ooffset, or len might be near -1>>>1. 2171 if ((ooffset < 0) || (toffset < 0) || 2172 (toffset > (long)length() - len) || 2173 (ooffset > (long)other.length() - len)) { 2174 return false; 2175 } 2176 byte[] tv = value; 2177 byte[] ov = other.value; 2178 byte coder = coder(); 2179 if (coder == other.coder()) { 2180 if (coder == UTF16) { 2181 toffset <<= UTF16; 2182 ooffset <<= UTF16; 2183 len <<= UTF16; 2184 } 2185 return ArraysSupport.mismatch(tv, toffset, 2186 ov, ooffset, len) < 0; 2187 } else { 2188 if (coder == LATIN1) { 2189 while (len-- > 0) { 2190 if (StringLatin1.getChar(tv, toffset++) != 2191 StringUTF16.getChar(ov, ooffset++)) { 2192 return false; 2193 } 2194 } 2195 } else { 2196 while (len-- > 0) { 2197 if (StringUTF16.getChar(tv, toffset++) != 2198 StringLatin1.getChar(ov, ooffset++)) { 2199 return false; 2200 } 2201 } 2202 } 2203 } 2204 return true; 2205 } 2206 2207 /** 2208 * Tests if two string regions are equal. 2209 * <p> 2210 * A substring of this {@code String} object is compared to a substring 2211 * of the argument {@code other}. The result is {@code true} if these 2212 * substrings represent Unicode code point sequences that are the same, 2213 * ignoring case if and only if {@code ignoreCase} is true. 2214 * The sequences {@code tsequence} and {@code osequence} are compared, 2215 * where {@code tsequence} is the sequence produced as if by calling 2216 * {@code this.substring(toffset, toffset + len).codePoints()} and 2217 * {@code osequence} is the sequence produced as if by calling 2218 * {@code other.substring(ooffset, ooffset + len).codePoints()}. 2219 * The result is {@code true} if and only if all of the following 2220 * are true: 2221 * <ul><li>{@code toffset} is non-negative. 2222 * <li>{@code ooffset} is non-negative. 2223 * <li>{@code toffset+len} is less than or equal to the length of this 2224 * {@code String} object. 
2225 * <li>{@code ooffset+len} is less than or equal to the length of the other 2226 * argument. 2227 * <li>if {@code ignoreCase} is {@code false}, all pairs of corresponding Unicode 2228 * code points are equal integer values; or if {@code ignoreCase} is {@code true}, 2229 * {@link Character#toLowerCase(int) Character.toLowerCase(} 2230 * {@link Character#toUpperCase(int)}{@code )} on all pairs of Unicode code points 2231 * results in equal integer values. 2232 * </ul> 2233 * 2234 * <p>Note that this method does <em>not</em> take locale into account, 2235 * and will result in unsatisfactory results for certain locales when 2236 * {@code ignoreCase} is {@code true}. The {@link java.text.Collator} class 2237 * provides locale-sensitive comparison. 2238 * 2239 * @param ignoreCase if {@code true}, ignore case when comparing 2240 * characters. 2241 * @param toffset the starting offset of the subregion in this 2242 * string. 2243 * @param other the string argument. 2244 * @param ooffset the starting offset of the subregion in the string 2245 * argument. 2246 * @param len the number of characters (Unicode code units - 2247 * 16bit {@code char} value) to compare. 2248 * @return {@code true} if the specified subregion of this string 2249 * matches the specified subregion of the string argument; 2250 * {@code false} otherwise. Whether the matching is exact 2251 * or case insensitive depends on the {@code ignoreCase} 2252 * argument. 2253 * @see #codePoints() 2254 */ 2255 public boolean regionMatches(boolean ignoreCase, int toffset, 2256 String other, int ooffset, int len) { 2257 if (!ignoreCase) { 2258 return regionMatches(toffset, other, ooffset, len); 2259 } 2260 // Note: toffset, ooffset, or len might be near -1>>>1. 
2261 if ((ooffset < 0) || (toffset < 0) 2262 || (toffset > (long)length() - len) 2263 || (ooffset > (long)other.length() - len)) { 2264 return false; 2265 } 2266 byte[] tv = value; 2267 byte[] ov = other.value; 2268 byte coder = coder(); 2269 if (coder == other.coder()) { 2270 return coder == LATIN1 2271 ? StringLatin1.regionMatchesCI(tv, toffset, ov, ooffset, len) 2272 : StringUTF16.regionMatchesCI(tv, toffset, ov, ooffset, len); 2273 } 2274 return coder == LATIN1 2275 ? StringLatin1.regionMatchesCI_UTF16(tv, toffset, ov, ooffset, len) 2276 : StringUTF16.regionMatchesCI_Latin1(tv, toffset, ov, ooffset, len); 2277 } 2278 2279 /** 2280 * Tests if the substring of this string beginning at the 2281 * specified index starts with the specified prefix. 2282 * 2283 * @param prefix the prefix. 2284 * @param toffset where to begin looking in this string. 2285 * @return {@code true} if the character sequence represented by the 2286 * argument is a prefix of the substring of this object starting 2287 * at index {@code toffset}; {@code false} otherwise. 2288 * The result is {@code false} if {@code toffset} is 2289 * negative or greater than the length of this 2290 * {@code String} object; otherwise the result is the same 2291 * as the result of the expression 2292 * <pre> 2293 * this.substring(toffset).startsWith(prefix) 2294 * </pre> 2295 */ 2296 public boolean startsWith(String prefix, int toffset) { 2297 // Note: toffset might be near -1>>>1. 
2298 if (toffset < 0 || toffset > length() - prefix.length()) { 2299 return false; 2300 } 2301 byte[] ta = value; 2302 byte[] pa = prefix.value; 2303 int po = 0; 2304 int pc = pa.length; 2305 byte coder = coder(); 2306 if (coder == prefix.coder()) { 2307 if (coder == UTF16) { 2308 toffset <<= UTF16; 2309 } 2310 return ArraysSupport.mismatch(ta, toffset, 2311 pa, 0, pc) < 0; 2312 } else { 2313 if (coder == LATIN1) { // && pcoder == UTF16 2314 return false; 2315 } 2316 // coder == UTF16 && pcoder == LATIN1) 2317 while (po < pc) { 2318 if (StringUTF16.getChar(ta, toffset++) != (pa[po++] & 0xff)) { 2319 return false; 2320 } 2321 } 2322 } 2323 return true; 2324 } 2325 2326 /** 2327 * Tests if this string starts with the specified prefix. 2328 * 2329 * @param prefix the prefix. 2330 * @return {@code true} if the character sequence represented by the 2331 * argument is a prefix of the character sequence represented by 2332 * this string; {@code false} otherwise. 2333 * Note also that {@code true} will be returned if the 2334 * argument is an empty string or is equal to this 2335 * {@code String} object as determined by the 2336 * {@link #equals(Object)} method. 2337 * @since 1.0 2338 */ 2339 public boolean startsWith(String prefix) { 2340 return startsWith(prefix, 0); 2341 } 2342 2343 /** 2344 * Tests if this string ends with the specified suffix. 2345 * 2346 * @param suffix the suffix. 2347 * @return {@code true} if the character sequence represented by the 2348 * argument is a suffix of the character sequence represented by 2349 * this object; {@code false} otherwise. Note that the 2350 * result will be {@code true} if the argument is the 2351 * empty string or is equal to this {@code String} object 2352 * as determined by the {@link #equals(Object)} method. 2353 */ 2354 public boolean endsWith(String suffix) { 2355 return startsWith(suffix, length() - suffix.length()); 2356 } 2357 2358 /** 2359 * Returns a hash code for this string. 
The hash code for a 2360 * {@code String} object is computed as 2361 * <blockquote><pre> 2362 * s[0]*31^(n-1) + s[1]*31^(n-2) + ... + s[n-1] 2363 * </pre></blockquote> 2364 * using {@code int} arithmetic, where {@code s[i]} is the 2365 * <i>i</i>th character of the string, {@code n} is the length of 2366 * the string, and {@code ^} indicates exponentiation. 2367 * (The hash value of the empty string is zero.) 2368 * 2369 * @return a hash code value for this object. 2370 */ 2371 public int hashCode() { 2372 // The hash or hashIsZero fields are subject to a benign data race, 2373 // making it crucial to ensure that any observable result of the 2374 // calculation in this method stays correct under any possible read of 2375 // these fields. Necessary restrictions to allow this to be correct 2376 // without explicit memory fences or similar concurrency primitives is 2377 // that we can ever only write to one of these two fields for a given 2378 // String instance, and that the computation is idempotent and derived 2379 // from immutable state 2380 int h = hash; 2381 if (h == 0 && !hashIsZero) { 2382 h = isLatin1() ? StringLatin1.hashCode(value) 2383 : StringUTF16.hashCode(value); 2384 if (h == 0) { 2385 hashIsZero = true; 2386 } else { 2387 hash = h; 2388 } 2389 } 2390 return h; 2391 } 2392 2393 /** 2394 * Returns the index within this string of the first occurrence of 2395 * the specified character. If a character with value 2396 * {@code ch} occurs in the character sequence represented by 2397 * this {@code String} object, then the index (in Unicode 2398 * code units) of the first such occurrence is returned. For 2399 * values of {@code ch} in the range from 0 to 0xFFFF 2400 * (inclusive), this is the smallest value <i>k</i> such that: 2401 * <blockquote><pre> 2402 * this.charAt(<i>k</i>) == ch 2403 * </pre></blockquote> 2404 * is true. 
For other values of {@code ch}, it is the 2405 * smallest value <i>k</i> such that: 2406 * <blockquote><pre> 2407 * this.codePointAt(<i>k</i>) == ch 2408 * </pre></blockquote> 2409 * is true. In either case, if no such character occurs in this 2410 * string, then {@code -1} is returned. 2411 * 2412 * @param ch a character (Unicode code point). 2413 * @return the index of the first occurrence of the character in the 2414 * character sequence represented by this object, or 2415 * {@code -1} if the character does not occur. 2416 */ 2417 public int indexOf(int ch) { 2418 return indexOf(ch, 0); 2419 } 2420 2421 /** 2422 * Returns the index within this string of the first occurrence of the 2423 * specified character, starting the search at the specified index. 2424 * <p> 2425 * If a character with value {@code ch} occurs in the 2426 * character sequence represented by this {@code String} 2427 * object at an index no smaller than {@code fromIndex}, then 2428 * the index of the first such occurrence is returned. For values 2429 * of {@code ch} in the range from 0 to 0xFFFF (inclusive), 2430 * this is the smallest value <i>k</i> such that: 2431 * <blockquote><pre> 2432 * (this.charAt(<i>k</i>) == ch) {@code &&} (<i>k</i> >= fromIndex) 2433 * </pre></blockquote> 2434 * is true. For other values of {@code ch}, it is the 2435 * smallest value <i>k</i> such that: 2436 * <blockquote><pre> 2437 * (this.codePointAt(<i>k</i>) == ch) {@code &&} (<i>k</i> >= fromIndex) 2438 * </pre></blockquote> 2439 * is true. In either case, if no such character occurs in this 2440 * string at or after position {@code fromIndex}, then 2441 * {@code -1} is returned. 2442 * 2443 * <p> 2444 * There is no restriction on the value of {@code fromIndex}. If it 2445 * is negative, it has the same effect as if it were zero: this entire 2446 * string may be searched. 
If it is greater than the length of this 2447 * string, it has the same effect as if it were equal to the length of 2448 * this string: {@code -1} is returned. 2449 * 2450 * <p>All indices are specified in {@code char} values 2451 * (Unicode code units). 2452 * 2453 * @param ch a character (Unicode code point). 2454 * @param fromIndex the index to start the search from. 2455 * @return the index of the first occurrence of the character in the 2456 * character sequence represented by this object that is greater 2457 * than or equal to {@code fromIndex}, or {@code -1} 2458 * if the character does not occur. 2459 * 2460 * @apiNote 2461 * Unlike {@link #substring(int)}, for example, this method does not throw 2462 * an exception when {@code fromIndex} is outside the valid range. 2463 * Rather, it returns -1 when {@code fromIndex} is larger than the length of 2464 * the string. 2465 * This result is, by itself, indistinguishable from a genuine absence of 2466 * {@code ch} in the string. 2467 * If stricter behavior is needed, {@link #indexOf(int, int, int)} 2468 * should be considered instead. 2469 * On a {@link String} {@code s}, for example, 2470 * {@code s.indexOf(ch, fromIndex, s.length())} would throw if 2471 * {@code fromIndex} were larger than the string length, or were negative. 2472 */ 2473 public int indexOf(int ch, int fromIndex) { 2474 return isLatin1() ? StringLatin1.indexOf(value, ch, fromIndex, length()) 2475 : StringUTF16.indexOf(value, ch, fromIndex, length()); 2476 } 2477 2478 /** 2479 * Returns the index within this string of the first occurrence of the 2480 * specified character, starting the search at {@code beginIndex} and 2481 * stopping before {@code endIndex}. 
2482 * 2483 * <p>If a character with value {@code ch} occurs in the 2484 * character sequence represented by this {@code String} 2485 * object at an index no smaller than {@code beginIndex} but smaller than 2486 * {@code endIndex}, then 2487 * the index of the first such occurrence is returned. For values 2488 * of {@code ch} in the range from 0 to 0xFFFF (inclusive), 2489 * this is the smallest value <i>k</i> such that: 2490 * <blockquote><pre> 2491 * (this.charAt(<i>k</i>) == ch) && (beginIndex <= <i>k</i> < endIndex) 2492 * </pre></blockquote> 2493 * is true. For other values of {@code ch}, it is the 2494 * smallest value <i>k</i> such that: 2495 * <blockquote><pre> 2496 * (this.codePointAt(<i>k</i>) == ch) && (beginIndex <= <i>k</i> < endIndex) 2497 * </pre></blockquote> 2498 * is true. In either case, if no such character occurs in this 2499 * string at or after position {@code beginIndex} and before position 2500 * {@code endIndex}, then {@code -1} is returned. 2501 * 2502 * <p>All indices are specified in {@code char} values 2503 * (Unicode code units). 2504 * 2505 * @param ch a character (Unicode code point). 2506 * @param beginIndex the index to start the search from (included). 2507 * @param endIndex the index to stop the search at (excluded). 2508 * @return the index of the first occurrence of the character in the 2509 * character sequence represented by this object that is greater 2510 * than or equal to {@code beginIndex} and less than {@code endIndex}, 2511 * or {@code -1} if the character does not occur. 2512 * @throws StringIndexOutOfBoundsException if {@code beginIndex} 2513 * is negative, or {@code endIndex} is larger than the length of 2514 * this {@code String} object, or {@code beginIndex} is larger than 2515 * {@code endIndex}. 2516 * @since 21 2517 */ 2518 public int indexOf(int ch, int beginIndex, int endIndex) { 2519 checkBoundsBeginEnd(beginIndex, endIndex, length()); 2520 return isLatin1() ? 
StringLatin1.indexOf(value, ch, beginIndex, endIndex) 2521 : StringUTF16.indexOf(value, ch, beginIndex, endIndex); 2522 } 2523 2524 /** 2525 * Returns the index within this string of the last occurrence of 2526 * the specified character. For values of {@code ch} in the 2527 * range from 0 to 0xFFFF (inclusive), the index (in Unicode code 2528 * units) returned is the largest value <i>k</i> such that: 2529 * <blockquote><pre> 2530 * this.charAt(<i>k</i>) == ch 2531 * </pre></blockquote> 2532 * is true. For other values of {@code ch}, it is the 2533 * largest value <i>k</i> such that: 2534 * <blockquote><pre> 2535 * this.codePointAt(<i>k</i>) == ch 2536 * </pre></blockquote> 2537 * is true. In either case, if no such character occurs in this 2538 * string, then {@code -1} is returned. The 2539 * {@code String} is searched backwards starting at the last 2540 * character. 2541 * 2542 * @param ch a character (Unicode code point). 2543 * @return the index of the last occurrence of the character in the 2544 * character sequence represented by this object, or 2545 * {@code -1} if the character does not occur. 2546 */ 2547 public int lastIndexOf(int ch) { 2548 return lastIndexOf(ch, length() - 1); 2549 } 2550 2551 /** 2552 * Returns the index within this string of the last occurrence of 2553 * the specified character, searching backward starting at the 2554 * specified index. For values of {@code ch} in the range 2555 * from 0 to 0xFFFF (inclusive), the index returned is the largest 2556 * value <i>k</i> such that: 2557 * <blockquote><pre> 2558 * (this.charAt(<i>k</i>) == ch) {@code &&} (<i>k</i> <= fromIndex) 2559 * </pre></blockquote> 2560 * is true. For other values of {@code ch}, it is the 2561 * largest value <i>k</i> such that: 2562 * <blockquote><pre> 2563 * (this.codePointAt(<i>k</i>) == ch) {@code &&} (<i>k</i> <= fromIndex) 2564 * </pre></blockquote> 2565 * is true. 
In either case, if no such character occurs in this 2566 * string at or before position {@code fromIndex}, then 2567 * {@code -1} is returned. 2568 * 2569 * <p>All indices are specified in {@code char} values 2570 * (Unicode code units). 2571 * 2572 * @param ch a character (Unicode code point). 2573 * @param fromIndex the index to start the search from. There is no 2574 * restriction on the value of {@code fromIndex}. If it is 2575 * greater than or equal to the length of this string, it has 2576 * the same effect as if it were equal to one less than the 2577 * length of this string: this entire string may be searched. 2578 * If it is negative, it has the same effect as if it were -1: 2579 * -1 is returned. 2580 * @return the index of the last occurrence of the character in the 2581 * character sequence represented by this object that is less 2582 * than or equal to {@code fromIndex}, or {@code -1} 2583 * if the character does not occur before that point. 2584 */ 2585 public int lastIndexOf(int ch, int fromIndex) { 2586 return isLatin1() ? StringLatin1.lastIndexOf(value, ch, fromIndex) 2587 : StringUTF16.lastIndexOf(value, ch, fromIndex); 2588 } 2589 2590 /** 2591 * Returns the index within this string of the first occurrence of the 2592 * specified substring. 2593 * 2594 * <p>The returned index is the smallest value {@code k} for which: 2595 * <pre>{@code 2596 * this.startsWith(str, k) 2597 * }</pre> 2598 * If no such value of {@code k} exists, then {@code -1} is returned. 2599 * 2600 * @param str the substring to search for. 2601 * @return the index of the first occurrence of the specified substring, 2602 * or {@code -1} if there is no such occurrence. 2603 */ 2604 public int indexOf(String str) { 2605 byte coder = coder(); 2606 if (coder == str.coder()) { 2607 return isLatin1() ? 
StringLatin1.indexOf(value, str.value) 2608 : StringUTF16.indexOf(value, str.value); 2609 } 2610 if (coder == LATIN1) { // str.coder == UTF16 2611 return -1; 2612 } 2613 return StringUTF16.indexOfLatin1(value, str.value); 2614 } 2615 2616 /** 2617 * Returns the index within this string of the first occurrence of the 2618 * specified substring, starting at the specified index. 2619 * 2620 * <p>The returned index is the smallest value {@code k} for which: 2621 * <pre>{@code 2622 * k >= Math.min(fromIndex, this.length()) && 2623 * this.startsWith(str, k) 2624 * }</pre> 2625 * If no such value of {@code k} exists, then {@code -1} is returned. 2626 * 2627 * @apiNote 2628 * Unlike {@link #substring(int)}, for example, this method does not throw 2629 * an exception when {@code fromIndex} is outside the valid range. 2630 * Rather, it returns -1 when {@code fromIndex} is larger than the length of 2631 * the string. 2632 * This result is, by itself, indistinguishable from a genuine absence of 2633 * {@code str} in the string. 2634 * If stricter behavior is needed, {@link #indexOf(String, int, int)} 2635 * should be considered instead. 2636 * On {@link String} {@code s} and a non-empty {@code str}, for example, 2637 * {@code s.indexOf(str, fromIndex, s.length())} would throw if 2638 * {@code fromIndex} were larger than the string length, or were negative. 2639 * 2640 * @param str the substring to search for. 2641 * @param fromIndex the index from which to start the search. 2642 * @return the index of the first occurrence of the specified substring, 2643 * starting at the specified index, 2644 * or {@code -1} if there is no such occurrence. 2645 */ 2646 public int indexOf(String str, int fromIndex) { 2647 return indexOf(value, coder(), length(), str, fromIndex); 2648 } 2649 2650 /** 2651 * Returns the index of the first occurrence of the specified substring 2652 * within the specified index range of {@code this} string. 
2653 * 2654 * <p>This method returns the same result as the one of the invocation 2655 * <pre>{@code 2656 * s.substring(beginIndex, endIndex).indexOf(str) + beginIndex 2657 * }</pre> 2658 * if the index returned by {@link #indexOf(String)} is non-negative, 2659 * and returns -1 otherwise. 2660 * (No substring is instantiated, though.) 2661 * 2662 * @param str the substring to search for. 2663 * @param beginIndex the index to start the search from (included). 2664 * @param endIndex the index to stop the search at (excluded). 2665 * @return the index of the first occurrence of the specified substring 2666 * within the specified index range, 2667 * or {@code -1} if there is no such occurrence. 2668 * @throws StringIndexOutOfBoundsException if {@code beginIndex} 2669 * is negative, or {@code endIndex} is larger than the length of 2670 * this {@code String} object, or {@code beginIndex} is larger than 2671 * {@code endIndex}. 2672 * @since 21 2673 */ 2674 public int indexOf(String str, int beginIndex, int endIndex) { 2675 if (str.length() == 1) { 2676 /* Simple optimization, can be omitted without behavioral impact */ 2677 return indexOf(str.charAt(0), beginIndex, endIndex); 2678 } 2679 checkBoundsBeginEnd(beginIndex, endIndex, length()); 2680 return indexOf(value, coder(), endIndex, str, beginIndex); 2681 } 2682 2683 /** 2684 * Code shared by String and AbstractStringBuilder to do searches. The 2685 * source is the character array being searched, and the target 2686 * is the string being searched for. 2687 * 2688 * @param src the characters being searched. 2689 * @param srcCoder the coder of the source string. 2690 * @param srcCount last index (exclusive) in the source string. 2691 * @param tgtStr the characters being searched for. 2692 * @param fromIndex the index to begin searching from. 
2693 */ 2694 static int indexOf(byte[] src, byte srcCoder, int srcCount, 2695 String tgtStr, int fromIndex) { 2696 fromIndex = Math.clamp(fromIndex, 0, srcCount); 2697 int tgtCount = tgtStr.length(); 2698 if (tgtCount > srcCount - fromIndex) { 2699 return -1; 2700 } 2701 if (tgtCount == 0) { 2702 return fromIndex; 2703 } 2704 2705 byte[] tgt = tgtStr.value; 2706 byte tgtCoder = tgtStr.coder(); 2707 if (srcCoder == tgtCoder) { 2708 return srcCoder == LATIN1 2709 ? StringLatin1.indexOf(src, srcCount, tgt, tgtCount, fromIndex) 2710 : StringUTF16.indexOf(src, srcCount, tgt, tgtCount, fromIndex); 2711 } 2712 if (srcCoder == LATIN1) { // && tgtCoder == UTF16 2713 return -1; 2714 } 2715 // srcCoder == UTF16 && tgtCoder == LATIN1) { 2716 return StringUTF16.indexOfLatin1(src, srcCount, tgt, tgtCount, fromIndex); 2717 } 2718 2719 /** 2720 * Returns the index within this string of the last occurrence of the 2721 * specified substring. The last occurrence of the empty string "" 2722 * is considered to occur at the index value {@code this.length()}. 2723 * 2724 * <p>The returned index is the largest value {@code k} for which: 2725 * <pre>{@code 2726 * this.startsWith(str, k) 2727 * }</pre> 2728 * If no such value of {@code k} exists, then {@code -1} is returned. 2729 * 2730 * @param str the substring to search for. 2731 * @return the index of the last occurrence of the specified substring, 2732 * or {@code -1} if there is no such occurrence. 2733 */ 2734 public int lastIndexOf(String str) { 2735 return lastIndexOf(str, length()); 2736 } 2737 2738 /** 2739 * Returns the index within this string of the last occurrence of the 2740 * specified substring, searching backward starting at the specified index. 2741 * 2742 * <p>The returned index is the largest value {@code k} for which: 2743 * <pre>{@code 2744 * k <= Math.min(fromIndex, this.length()) && 2745 * this.startsWith(str, k) 2746 * }</pre> 2747 * If no such value of {@code k} exists, then {@code -1} is returned. 
2748 * 2749 * @param str the substring to search for. 2750 * @param fromIndex the index to start the search from. 2751 * @return the index of the last occurrence of the specified substring, 2752 * searching backward from the specified index, 2753 * or {@code -1} if there is no such occurrence. 2754 */ 2755 public int lastIndexOf(String str, int fromIndex) { 2756 return lastIndexOf(value, coder(), length(), str, fromIndex); 2757 } 2758 2759 /** 2760 * Code shared by String and AbstractStringBuilder to do searches. The 2761 * source is the character array being searched, and the target 2762 * is the string being searched for. 2763 * 2764 * @param src the characters being searched. 2765 * @param srcCoder coder handles the mapping between bytes/chars 2766 * @param srcCount count of the source string. 2767 * @param tgtStr the characters being searched for. 2768 * @param fromIndex the index to begin searching from. 2769 */ 2770 static int lastIndexOf(byte[] src, byte srcCoder, int srcCount, 2771 String tgtStr, int fromIndex) { 2772 byte[] tgt = tgtStr.value; 2773 byte tgtCoder = tgtStr.coder(); 2774 int tgtCount = tgtStr.length(); 2775 /* 2776 * Check arguments; return immediately where possible. For 2777 * consistency, don't check for null str. 2778 */ 2779 int rightIndex = srcCount - tgtCount; 2780 if (fromIndex > rightIndex) { 2781 fromIndex = rightIndex; 2782 } 2783 if (fromIndex < 0) { 2784 return -1; 2785 } 2786 /* Empty string always matches. */ 2787 if (tgtCount == 0) { 2788 return fromIndex; 2789 } 2790 if (srcCoder == tgtCoder) { 2791 return srcCoder == LATIN1 2792 ? 
StringLatin1.lastIndexOf(src, srcCount, tgt, tgtCount, fromIndex) 2793 : StringUTF16.lastIndexOf(src, srcCount, tgt, tgtCount, fromIndex); 2794 } 2795 if (srcCoder == LATIN1) { // && tgtCoder == UTF16 2796 return -1; 2797 } 2798 // srcCoder == UTF16 && tgtCoder == LATIN1 2799 return StringUTF16.lastIndexOfLatin1(src, srcCount, tgt, tgtCount, fromIndex); 2800 } 2801 2802 /** 2803 * Returns a string that is a substring of this string. The 2804 * substring begins with the character at the specified index and 2805 * extends to the end of this string. <p> 2806 * Examples: 2807 * <blockquote><pre> 2808 * "unhappy".substring(2) returns "happy" 2809 * "Harbison".substring(3) returns "bison" 2810 * "emptiness".substring(9) returns "" (an empty string) 2811 * </pre></blockquote> 2812 * 2813 * @param beginIndex the beginning index, inclusive. 2814 * @return the specified substring. 2815 * @throws IndexOutOfBoundsException if 2816 * {@code beginIndex} is negative or larger than the 2817 * length of this {@code String} object. 2818 */ 2819 public String substring(int beginIndex) { 2820 return substring(beginIndex, length()); 2821 } 2822 2823 /** 2824 * Returns a string that is a substring of this string. The 2825 * substring begins at the specified {@code beginIndex} and 2826 * extends to the character at index {@code endIndex - 1}. 2827 * Thus the length of the substring is {@code endIndex-beginIndex}. 2828 * <p> 2829 * Examples: 2830 * <blockquote><pre> 2831 * "hamburger".substring(4, 8) returns "urge" 2832 * "smiles".substring(1, 5) returns "mile" 2833 * </pre></blockquote> 2834 * 2835 * @param beginIndex the beginning index, inclusive. 2836 * @param endIndex the ending index, exclusive. 2837 * @return the specified substring. 2838 * @throws IndexOutOfBoundsException if the 2839 * {@code beginIndex} is negative, or 2840 * {@code endIndex} is larger than the length of 2841 * this {@code String} object, or 2842 * {@code beginIndex} is larger than 2843 * {@code endIndex}. 
2844 */ 2845 public String substring(int beginIndex, int endIndex) { 2846 int length = length(); 2847 checkBoundsBeginEnd(beginIndex, endIndex, length); 2848 if (beginIndex == 0 && endIndex == length) { 2849 return this; 2850 } 2851 int subLen = endIndex - beginIndex; 2852 return isLatin1() ? StringLatin1.newString(value, beginIndex, subLen) 2853 : StringUTF16.newString(value, beginIndex, subLen); 2854 } 2855 2856 /** 2857 * Returns a character sequence that is a subsequence of this sequence. 2858 * 2859 * <p> An invocation of this method of the form 2860 * 2861 * <blockquote><pre> 2862 * str.subSequence(begin, end)</pre></blockquote> 2863 * 2864 * behaves in exactly the same way as the invocation 2865 * 2866 * <blockquote><pre> 2867 * str.substring(begin, end)</pre></blockquote> 2868 * 2869 * @apiNote 2870 * This method is defined so that the {@code String} class can implement 2871 * the {@link CharSequence} interface. 2872 * 2873 * @param beginIndex the begin index, inclusive. 2874 * @param endIndex the end index, exclusive. 2875 * @return the specified subsequence. 2876 * 2877 * @throws IndexOutOfBoundsException 2878 * if {@code beginIndex} or {@code endIndex} is negative, 2879 * if {@code endIndex} is greater than {@code length()}, 2880 * or if {@code beginIndex} is greater than {@code endIndex} 2881 * 2882 * @since 1.4 2883 */ 2884 public CharSequence subSequence(int beginIndex, int endIndex) { 2885 return this.substring(beginIndex, endIndex); 2886 } 2887 2888 /** 2889 * Concatenates the specified string to the end of this string. 2890 * <p> 2891 * If the length of the argument string is {@code 0}, then this 2892 * {@code String} object is returned. 
Otherwise, a 2893 * {@code String} object is returned that represents a character 2894 * sequence that is the concatenation of the character sequence 2895 * represented by this {@code String} object and the character 2896 * sequence represented by the argument string.<p> 2897 * Examples: 2898 * <blockquote><pre> 2899 * "cares".concat("s") returns "caress" 2900 * "to".concat("get").concat("her") returns "together" 2901 * </pre></blockquote> 2902 * 2903 * @param str the {@code String} that is concatenated to the end 2904 * of this {@code String}. 2905 * @return a string that represents the concatenation of this object's 2906 * characters followed by the string argument's characters. 2907 */ 2908 public String concat(String str) { 2909 if (str.isEmpty()) { 2910 return this; 2911 } 2912 return StringConcatHelper.simpleConcat(this, str); 2913 } 2914 2915 /** 2916 * Returns a string resulting from replacing all occurrences of 2917 * {@code oldChar} in this string with {@code newChar}. 2918 * <p> 2919 * If the character {@code oldChar} does not occur in the 2920 * character sequence represented by this {@code String} object, 2921 * then a reference to this {@code String} object is returned. 2922 * Otherwise, a {@code String} object is returned that 2923 * represents a character sequence identical to the character sequence 2924 * represented by this {@code String} object, except that every 2925 * occurrence of {@code oldChar} is replaced by an occurrence 2926 * of {@code newChar}. 2927 * <p> 2928 * Examples: 2929 * <blockquote><pre> 2930 * "mesquite in your cellar".replace('e', 'o') 2931 * returns "mosquito in your collar" 2932 * "the war of baronets".replace('r', 'y') 2933 * returns "the way of bayonets" 2934 * "sparring with a purple porpoise".replace('p', 't') 2935 * returns "starring with a turtle tortoise" 2936 * "JonL".replace('q', 'x') returns "JonL" (no change) 2937 * </pre></blockquote> 2938 * 2939 * @param oldChar the old character. 
2940 * @param newChar the new character. 2941 * @return a string derived from this string by replacing every 2942 * occurrence of {@code oldChar} with {@code newChar}. 2943 */ 2944 public String replace(char oldChar, char newChar) { 2945 if (oldChar != newChar) { 2946 String ret = isLatin1() ? StringLatin1.replace(value, oldChar, newChar) 2947 : StringUTF16.replace(value, oldChar, newChar); 2948 if (ret != null) { 2949 return ret; 2950 } 2951 } 2952 return this; 2953 } 2954 2955 /** 2956 * Tells whether or not this string matches the given <a 2957 * href="../util/regex/Pattern.html#sum">regular expression</a>. 2958 * 2959 * <p> An invocation of this method of the form 2960 * <i>str</i>{@code .matches(}<i>regex</i>{@code )} yields exactly the 2961 * same result as the expression 2962 * 2963 * <blockquote> 2964 * {@link java.util.regex.Pattern}.{@link java.util.regex.Pattern#matches(String,CharSequence) 2965 * matches(<i>regex</i>, <i>str</i>)} 2966 * </blockquote> 2967 * 2968 * @param regex 2969 * the regular expression to which this string is to be matched 2970 * 2971 * @return {@code true} if, and only if, this string matches the 2972 * given regular expression 2973 * 2974 * @throws PatternSyntaxException 2975 * if the regular expression's syntax is invalid 2976 * 2977 * @see java.util.regex.Pattern 2978 * 2979 * @since 1.4 2980 */ 2981 public boolean matches(String regex) { 2982 return Pattern.matches(regex, this); 2983 } 2984 2985 /** 2986 * Returns true if and only if this string contains the specified 2987 * sequence of char values. 2988 * 2989 * @param s the sequence to search for 2990 * @return true if this string contains {@code s}, false otherwise 2991 * @since 1.5 2992 */ 2993 public boolean contains(CharSequence s) { 2994 return indexOf(s.toString()) >= 0; 2995 } 2996 2997 /** 2998 * Replaces the first substring of this string that matches the given <a 2999 * href="../util/regex/Pattern.html#sum">regular expression</a> with the 3000 * given replacement. 
3001 * 3002 * <p> An invocation of this method of the form 3003 * <i>str</i>{@code .replaceFirst(}<i>regex</i>{@code ,} <i>repl</i>{@code )} 3004 * yields exactly the same result as the expression 3005 * 3006 * <blockquote> 3007 * <code> 3008 * {@link java.util.regex.Pattern}.{@link 3009 * java.util.regex.Pattern#compile(String) compile}(<i>regex</i>).{@link 3010 * java.util.regex.Pattern#matcher(java.lang.CharSequence) matcher}(<i>str</i>).{@link 3011 * java.util.regex.Matcher#replaceFirst(String) replaceFirst}(<i>repl</i>) 3012 * </code> 3013 * </blockquote> 3014 * 3015 *<p> 3016 * Note that backslashes ({@code \}) and dollar signs ({@code $}) in the 3017 * replacement string may cause the results to be different than if it were 3018 * being treated as a literal replacement string; see 3019 * {@link java.util.regex.Matcher#replaceFirst}. 3020 * Use {@link java.util.regex.Matcher#quoteReplacement} to suppress the special 3021 * meaning of these characters, if desired. 3022 * 3023 * @param regex 3024 * the regular expression to which this string is to be matched 3025 * @param replacement 3026 * the string to be substituted for the first match 3027 * 3028 * @return The resulting {@code String} 3029 * 3030 * @throws PatternSyntaxException 3031 * if the regular expression's syntax is invalid 3032 * 3033 * @see java.util.regex.Pattern 3034 * 3035 * @since 1.4 3036 */ 3037 public String replaceFirst(String regex, String replacement) { 3038 return Pattern.compile(regex).matcher(this).replaceFirst(replacement); 3039 } 3040 3041 /** 3042 * Replaces each substring of this string that matches the given <a 3043 * href="../util/regex/Pattern.html#sum">regular expression</a> with the 3044 * given replacement. 
3045 * 3046 * <p> An invocation of this method of the form 3047 * <i>str</i>{@code .replaceAll(}<i>regex</i>{@code ,} <i>repl</i>{@code )} 3048 * yields exactly the same result as the expression 3049 * 3050 * <blockquote> 3051 * <code> 3052 * {@link java.util.regex.Pattern}.{@link 3053 * java.util.regex.Pattern#compile(String) compile}(<i>regex</i>).{@link 3054 * java.util.regex.Pattern#matcher(java.lang.CharSequence) matcher}(<i>str</i>).{@link 3055 * java.util.regex.Matcher#replaceAll(String) replaceAll}(<i>repl</i>) 3056 * </code> 3057 * </blockquote> 3058 * 3059 *<p> 3060 * Note that backslashes ({@code \}) and dollar signs ({@code $}) in the 3061 * replacement string may cause the results to be different than if it were 3062 * being treated as a literal replacement string; see 3063 * {@link java.util.regex.Matcher#replaceAll Matcher.replaceAll}. 3064 * Use {@link java.util.regex.Matcher#quoteReplacement} to suppress the special 3065 * meaning of these characters, if desired. 3066 * 3067 * @param regex 3068 * the regular expression to which this string is to be matched 3069 * @param replacement 3070 * the string to be substituted for each match 3071 * 3072 * @return The resulting {@code String} 3073 * 3074 * @throws PatternSyntaxException 3075 * if the regular expression's syntax is invalid 3076 * 3077 * @see java.util.regex.Pattern 3078 * 3079 * @since 1.4 3080 */ 3081 public String replaceAll(String regex, String replacement) { 3082 return Pattern.compile(regex).matcher(this).replaceAll(replacement); 3083 } 3084 3085 /** 3086 * Replaces each substring of this string that matches the literal target 3087 * sequence with the specified literal replacement sequence. The 3088 * replacement proceeds from the beginning of the string to the end, for 3089 * example, replacing "aa" with "b" in the string "aaa" will result in 3090 * "ba" rather than "ab". 
3091 * 3092 * @param target The sequence of char values to be replaced 3093 * @param replacement The replacement sequence of char values 3094 * @return The resulting string 3095 * @since 1.5 3096 */ 3097 public String replace(CharSequence target, CharSequence replacement) { 3098 String trgtStr = target.toString(); 3099 String replStr = replacement.toString(); 3100 int thisLen = length(); 3101 int trgtLen = trgtStr.length(); 3102 int replLen = replStr.length(); 3103 3104 if (trgtLen > 0) { 3105 if (trgtLen == 1 && replLen == 1) { 3106 return replace(trgtStr.charAt(0), replStr.charAt(0)); 3107 } 3108 3109 boolean thisIsLatin1 = this.isLatin1(); 3110 boolean trgtIsLatin1 = trgtStr.isLatin1(); 3111 boolean replIsLatin1 = replStr.isLatin1(); 3112 String ret = (thisIsLatin1 && trgtIsLatin1 && replIsLatin1) 3113 ? StringLatin1.replace(value, thisLen, 3114 trgtStr.value, trgtLen, 3115 replStr.value, replLen) 3116 : StringUTF16.replace(value, thisLen, thisIsLatin1, 3117 trgtStr.value, trgtLen, trgtIsLatin1, 3118 replStr.value, replLen, replIsLatin1); 3119 if (ret != null) { 3120 return ret; 3121 } 3122 return this; 3123 3124 } else { // trgtLen == 0 3125 int resultLen; 3126 try { 3127 resultLen = Math.addExact(thisLen, Math.multiplyExact( 3128 Math.addExact(thisLen, 1), replLen)); 3129 } catch (ArithmeticException ignored) { 3130 throw new OutOfMemoryError("Required length exceeds implementation limit"); 3131 } 3132 3133 StringBuilder sb = new StringBuilder(resultLen); 3134 sb.append(replStr); 3135 for (int i = 0; i < thisLen; ++i) { 3136 sb.append(charAt(i)).append(replStr); 3137 } 3138 return sb.toString(); 3139 } 3140 } 3141 3142 /** 3143 * Splits this string around matches of the given 3144 * <a href="../util/regex/Pattern.html#sum">regular expression</a>. 
3145 * 3146 * <p> The array returned by this method contains each substring of this 3147 * string that is terminated by another substring that matches the given 3148 * expression or is terminated by the end of the string. The substrings in 3149 * the array are in the order in which they occur in this string. If the 3150 * expression does not match any part of the input then the resulting array 3151 * has just one element, namely this string. 3152 * 3153 * <p> When there is a positive-width match at the beginning of this 3154 * string then an empty leading substring is included at the beginning 3155 * of the resulting array. A zero-width match at the beginning however 3156 * never produces such empty leading substring. 3157 * 3158 * <p> The {@code limit} parameter controls the number of times the 3159 * pattern is applied and therefore affects the length of the resulting 3160 * array. 3161 * <ul> 3162 * <li><p> 3163 * If the <i>limit</i> is positive then the pattern will be applied 3164 * at most <i>limit</i> - 1 times, the array's length will be 3165 * no greater than <i>limit</i>, and the array's last entry will contain 3166 * all input beyond the last matched delimiter.</p></li> 3167 * 3168 * <li><p> 3169 * If the <i>limit</i> is zero then the pattern will be applied as 3170 * many times as possible, the array can have any length, and trailing 3171 * empty strings will be discarded.</p></li> 3172 * 3173 * <li><p> 3174 * If the <i>limit</i> is negative then the pattern will be applied 3175 * as many times as possible and the array can have any length.</p></li> 3176 * </ul> 3177 * 3178 * <p> The string {@code "boo:and:foo"}, for example, yields the 3179 * following results with these parameters: 3180 * 3181 * <blockquote><table class="plain"> 3182 * <caption style="display:none">Split example showing regex, limit, and result</caption> 3183 * <thead> 3184 * <tr> 3185 * <th scope="col">Regex</th> 3186 * <th scope="col">Limit</th> 3187 * <th scope="col">Result</th> 
3188 * </tr> 3189 * </thead> 3190 * <tbody> 3191 * <tr><th scope="row" rowspan="3" style="font-weight:normal">:</th> 3192 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">2</th> 3193 * <td>{@code { "boo", "and:foo" }}</td></tr> 3194 * <tr><!-- : --> 3195 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 3196 * <td>{@code { "boo", "and", "foo" }}</td></tr> 3197 * <tr><!-- : --> 3198 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-2</th> 3199 * <td>{@code { "boo", "and", "foo" }}</td></tr> 3200 * <tr><th scope="row" rowspan="3" style="font-weight:normal">o</th> 3201 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 3202 * <td>{@code { "b", "", ":and:f", "", "" }}</td></tr> 3203 * <tr><!-- o --> 3204 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-2</th> 3205 * <td>{@code { "b", "", ":and:f", "", "" }}</td></tr> 3206 * <tr><!-- o --> 3207 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">0</th> 3208 * <td>{@code { "b", "", ":and:f" }}</td></tr> 3209 * </tbody> 3210 * </table></blockquote> 3211 * 3212 * <p> An invocation of this method of the form 3213 * <i>str.</i>{@code split(}<i>regex</i>{@code ,} <i>n</i>{@code )} 3214 * yields the same result as the expression 3215 * 3216 * <blockquote> 3217 * <code> 3218 * {@link java.util.regex.Pattern}.{@link 3219 * java.util.regex.Pattern#compile(String) compile}(<i>regex</i>).{@link 3220 * java.util.regex.Pattern#split(java.lang.CharSequence,int) split}(<i>str</i>, <i>n</i>) 3221 * </code> 3222 * </blockquote> 3223 * 3224 * 3225 * @param regex 3226 * the delimiting regular expression 3227 * 3228 * @param limit 3229 * the result threshold, as described above 3230 * 3231 * @return the array of strings computed by splitting this string 3232 * around matches of the given regular expression 3233 * 3234 * @throws 
PatternSyntaxException 3235 * if the regular expression's syntax is invalid 3236 * 3237 * @see java.util.regex.Pattern 3238 * 3239 * @since 1.4 3240 */ 3241 public String[] split(String regex, int limit) { 3242 return split(regex, limit, false); 3243 } 3244 3245 /** 3246 * Splits this string around matches of the given regular expression and 3247 * returns both the strings and the matching delimiters. 3248 * 3249 * <p> The array returned by this method contains each substring of this 3250 * string that is terminated by another substring that matches the given 3251 * expression or is terminated by the end of the string. 3252 * Each substring is immediately followed by the subsequence (the delimiter) 3253 * that matches the given expression, <em>except</em> for the last 3254 * substring, which is not followed by anything. 3255 * The substrings in the array and the delimiters are in the order in which 3256 * they occur in the input. 3257 * If the expression does not match any part of the input then the resulting 3258 * array has just one element, namely this string. 3259 * 3260 * <p> When there is a positive-width match at the beginning of this 3261 * string then an empty leading substring is included at the beginning 3262 * of the resulting array. A zero-width match at the beginning however 3263 * never produces such empty leading substring nor the empty delimiter. 3264 * 3265 * <p> The {@code limit} parameter controls the number of times the 3266 * pattern is applied and therefore affects the length of the resulting 3267 * array. 
3268 * <ul> 3269 * <li> If the <i>limit</i> is positive then the pattern will be applied 3270 * at most <i>limit</i> - 1 times, the array's length will be 3271 * no greater than 2 × <i>limit</i> - 1, and the array's last 3272 * entry will contain all input beyond the last matched delimiter.</li> 3273 * 3274 * <li> If the <i>limit</i> is zero then the pattern will be applied as 3275 * many times as possible, the array can have any length, and trailing 3276 * empty strings will be discarded.</li> 3277 * 3278 * <li> If the <i>limit</i> is negative then the pattern will be applied 3279 * as many times as possible and the array can have any length.</li> 3280 * </ul> 3281 * 3282 * <p> The input {@code "boo:::and::foo"}, for example, yields the following 3283 * results with these parameters: 3284 * 3285 * <table class="plain" style="margin-left:2em;"> 3286 * <caption style="display:none">Split example showing regex, limit, and result</caption> 3287 * <thead> 3288 * <tr> 3289 * <th scope="col">Regex</th> 3290 * <th scope="col">Limit</th> 3291 * <th scope="col">Result</th> 3292 * </tr> 3293 * </thead> 3294 * <tbody> 3295 * <tr><th scope="row" rowspan="3" style="font-weight:normal">:+</th> 3296 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">2</th> 3297 * <td>{@code { "boo", ":::", "and::foo" }}</td></tr> 3298 * <tr><!-- : --> 3299 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 3300 * <td>{@code { "boo", ":::", "and", "::", "foo" }}</td></tr> 3301 * <tr><!-- : --> 3302 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-1</th> 3303 * <td>{@code { "boo", ":::", "and", "::", "foo" }}</td></tr> 3304 * <tr><th scope="row" rowspan="3" style="font-weight:normal">o</th> 3305 * <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th> 3306 * <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o", "" }}</td></tr> 3307 * <tr><!-- o --> 3308 * <th 
     *             scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-1</th>
     *         <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o", "" }}</td></tr>
     *     <tr><!-- o -->
     *         <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">0</th>
     *         <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o" }}</td></tr>
     * </tbody>
     * </table>
     *
     * @apiNote An invocation of this method of the form
     * <i>str.</i>{@code splitWithDelimiters(}<i>regex</i>{@code ,} <i>n</i>{@code )}
     * yields the same result as the expression
     *
     * <blockquote>
     * <code>
     * {@link java.util.regex.Pattern}.{@link
     * java.util.regex.Pattern#compile(String) compile}(<i>regex</i>).{@link
     * java.util.regex.Pattern#splitWithDelimiters(CharSequence,int) splitWithDelimiters}(<i>str</i>, <i>n</i>)
     * </code>
     * </blockquote>
     *
     * @param   regex
     *          the delimiting regular expression
     *
     * @param   limit
     *          the result threshold, as described above
     *
     * @return  the array of strings computed by splitting this string
     *          around matches of the given regular expression, alternating
     *          substrings and matching delimiters
     *
     * @since   21
     */
    public String[] splitWithDelimiters(String regex, int limit) {
        return split(regex, limit, true);
    }

    /**
     * Common entry point for {@code split} and {@code splitWithDelimiters}.
     * Regexes that denote exactly one literal, non-surrogate char are routed
     * to the char-based split; every other regex is compiled to a
     * {@link Pattern} and split there.
     *
     * @param   regex           the delimiting regular expression
     * @param   limit           the result threshold
     * @param   withDelimiters  whether matched delimiters are included in
     *                          the returned array
     * @return  the array of strings computed by splitting this string
     */
    private String[] split(String regex, int limit, boolean withDelimiters) {
        /* fastpath if the regex is a
         * (1) one-char String and this character is not one of the
         *     RegEx's meta characters ".$|()[{^?*+\\", or
         * (2) two-char String and the first char is the backslash and
         *     the second is not the ascii digit or ascii letter.
         *
         * ch is assigned inside the condition, so on success it holds the
         * literal delimiter char for whichever case matched.
         *
         * Each test of the form ((ch-lo)|(hi-ch)) < 0 is true exactly when
         * ch lies outside [lo, hi]: the OR of the two int differences is
         * negative iff at least one of them is negative.
         *
         * In both cases the char must additionally lie outside the range
         * MIN_HIGH_SURROGATE..MAX_LOW_SURROGATE.
         */
        char ch = 0;
        if (((regex.length() == 1 &&
                ".$|()[{^?*+\\".indexOf(ch = regex.charAt(0)) == -1) ||
                (regex.length() == 2 &&
                        regex.charAt(0) == '\\' &&
                        (((ch = regex.charAt(1))-'0')|('9'-ch)) < 0 &&
                        ((ch-'a')|('z'-ch)) < 0 &&
                        ((ch-'A')|('Z'-ch)) < 0)) &&
                (ch < Character.MIN_HIGH_SURROGATE ||
                        ch > Character.MAX_LOW_SURROGATE))
        {
            // All the checks above can potentially be constant folded by
            // a JIT/AOT compiler when the regex is a constant string.
            // That requires method inlining of the checks, which is only
            // possible when the actual split logic is in a separate method
            // because the large split loop can usually not be inlined.
            return split(ch, limit, withDelimiters);
        }
        // General case: let the regex engine do the splitting.
        Pattern pattern = Pattern.compile(regex);
        return withDelimiters
                ? pattern.splitWithDelimiters(this, limit)
                : pattern.split(this, limit);
    }

    /**
     * Splits this string around occurrences of the single char {@code ch}.
     *
     * @param   ch              the literal delimiter char
     * @param   limit           the result threshold; positive values cap the
     *                          number of substrings, zero drops trailing
     *                          empty strings, negative values do neither
     * @param   withDelimiters  whether each matched delimiter is added to
     *                          the result after the substring it terminates
     * @return  the resulting array; a one-element array holding this string
     *          when {@code ch} does not occur
     */
    private String[] split(char ch, int limit, boolean withDelimiters) {
        int matchCount = 0;     // number of substrings emitted by the loop
        int off = 0;            // start of the substring not yet emitted
        int next;               // index of the next occurrence of ch
        boolean limited = limit > 0;
        ArrayList<String> list = new ArrayList<>();
        // The delimiter string is created once and reused for every match.
        String del = withDelimiters ? String.valueOf(ch) : null;
        while ((next = indexOf(ch, off)) != -1) {
            if (!limited || matchCount < limit - 1) {
                list.add(substring(off, next));
                if (withDelimiters) {
                    list.add(del);
                }
                off = next + 1;
                ++matchCount;
            } else {    // last one
                // The limit is reached: the rest of the string, including
                // any further delimiters, becomes the final element.
                int last = length();
                list.add(substring(off, last));
                off = last;
                ++matchCount;
                break;
            }
        }
        // If no match was found, return this
        if (off == 0)
            return new String[] {this};

        // Add remaining segment
        if (!limited || matchCount < limit)
            list.add(substring(off, length()));

        // Construct result
        int resultSize = list.size();
        if (limit == 0) {
            // A limit of zero discards trailing empty strings.
            while (resultSize > 0 && list.get(resultSize - 1).isEmpty()) {
                resultSize--;
            }
        }
        String[] result = new String[resultSize];
        return list.subList(0, resultSize).toArray(result);
    }

    /**
     * Splits this string around matches of the given <a
     * href="../util/regex/Pattern.html#sum">regular expression</a>.
     *
     * <p> This method works as if by invoking the two-argument {@link
     * #split(String, int) split} method with the given expression and a limit
     * argument of zero.  Trailing empty strings are therefore not included in
     * the resulting array.
3425 * 3426 * <p> The string {@code "boo:and:foo"}, for example, yields the following 3427 * results with these expressions: 3428 * 3429 * <blockquote><table class="plain"> 3430 * <caption style="display:none">Split examples showing regex and result</caption> 3431 * <thead> 3432 * <tr> 3433 * <th scope="col">Regex</th> 3434 * <th scope="col">Result</th> 3435 * </tr> 3436 * </thead> 3437 * <tbody> 3438 * <tr><th scope="row" style="text-weight:normal">:</th> 3439 * <td>{@code { "boo", "and", "foo" }}</td></tr> 3440 * <tr><th scope="row" style="text-weight:normal">o</th> 3441 * <td>{@code { "b", "", ":and:f" }}</td></tr> 3442 * </tbody> 3443 * </table></blockquote> 3444 * 3445 * 3446 * @param regex 3447 * the delimiting regular expression 3448 * 3449 * @return the array of strings computed by splitting this string 3450 * around matches of the given regular expression 3451 * 3452 * @throws PatternSyntaxException 3453 * if the regular expression's syntax is invalid 3454 * 3455 * @see java.util.regex.Pattern 3456 * 3457 * @since 1.4 3458 */ 3459 public String[] split(String regex) { 3460 return split(regex, 0, false); 3461 } 3462 3463 /** 3464 * Returns a new String composed of copies of the 3465 * {@code CharSequence elements} joined together with a copy of 3466 * the specified {@code delimiter}. 3467 * 3468 * <blockquote>For example, 3469 * <pre>{@code 3470 * String message = String.join("-", "Java", "is", "cool"); 3471 * // message returned is: "Java-is-cool" 3472 * }</pre></blockquote> 3473 * 3474 * Note that if an element is null, then {@code "null"} is added. 3475 * 3476 * @param delimiter the delimiter that separates each element 3477 * @param elements the elements to join together. 
3478 * 3479 * @return a new {@code String} that is composed of the {@code elements} 3480 * separated by the {@code delimiter} 3481 * 3482 * @throws NullPointerException If {@code delimiter} or {@code elements} 3483 * is {@code null} 3484 * 3485 * @see java.util.StringJoiner 3486 * @since 1.8 3487 */ 3488 public static String join(CharSequence delimiter, CharSequence... elements) { 3489 var delim = delimiter.toString(); 3490 var elems = new String[elements.length]; 3491 for (int i = 0; i < elements.length; i++) { 3492 elems[i] = String.valueOf(elements[i]); 3493 } 3494 return join("", "", delim, elems, elems.length); 3495 } 3496 3497 /** 3498 * Designated join routine. 3499 * 3500 * @param prefix the non-null prefix 3501 * @param suffix the non-null suffix 3502 * @param delimiter the non-null delimiter 3503 * @param elements the non-null array of non-null elements 3504 * @param size the number of elements in the array (<= elements.length) 3505 * @return the joined string 3506 */ 3507 @ForceInline 3508 static String join(String prefix, String suffix, String delimiter, String[] elements, int size) { 3509 int icoder = prefix.coder() | suffix.coder(); 3510 long len = (long) prefix.length() + suffix.length(); 3511 if (size > 1) { // when there are more than one element, size - 1 delimiters will be emitted 3512 len += (long) (size - 1) * delimiter.length(); 3513 icoder |= delimiter.coder(); 3514 } 3515 // assert len > 0L; // max: (long) Integer.MAX_VALUE << 32 3516 // following loop will add max: (long) Integer.MAX_VALUE * Integer.MAX_VALUE to len 3517 // so len can overflow at most once 3518 for (int i = 0; i < size; i++) { 3519 var el = elements[i]; 3520 len += el.length(); 3521 icoder |= el.coder(); 3522 } 3523 byte coder = (byte) icoder; 3524 // long len overflow check, char -> byte length, int len overflow check 3525 if (len < 0L || (len <<= coder) != (int) len) { 3526 throw new OutOfMemoryError("Requested string length exceeds VM limit"); 3527 } 3528 byte[] value = 
StringConcatHelper.newArray(len); 3529 3530 int off = 0; 3531 prefix.getBytes(value, off, coder); off += prefix.length(); 3532 if (size > 0) { 3533 var el = elements[0]; 3534 el.getBytes(value, off, coder); off += el.length(); 3535 for (int i = 1; i < size; i++) { 3536 delimiter.getBytes(value, off, coder); off += delimiter.length(); 3537 el = elements[i]; 3538 el.getBytes(value, off, coder); off += el.length(); 3539 } 3540 } 3541 suffix.getBytes(value, off, coder); 3542 // assert off + suffix.length() == value.length >> coder; 3543 3544 return new String(value, coder); 3545 } 3546 3547 /** 3548 * Returns a new {@code String} composed of copies of the 3549 * {@code CharSequence elements} joined together with a copy of the 3550 * specified {@code delimiter}. 3551 * 3552 * <blockquote>For example, 3553 * <pre>{@code 3554 * List<String> strings = List.of("Java", "is", "cool"); 3555 * String message = String.join(" ", strings); 3556 * // message returned is: "Java is cool" 3557 * 3558 * Set<String> strings = 3559 * new LinkedHashSet<>(List.of("Java", "is", "very", "cool")); 3560 * String message = String.join("-", strings); 3561 * // message returned is: "Java-is-very-cool" 3562 * }</pre></blockquote> 3563 * 3564 * Note that if an individual element is {@code null}, then {@code "null"} is added. 3565 * 3566 * @param delimiter a sequence of characters that is used to separate each 3567 * of the {@code elements} in the resulting {@code String} 3568 * @param elements an {@code Iterable} that will have its {@code elements} 3569 * joined together. 3570 * 3571 * @return a new {@code String} that is composed from the {@code elements} 3572 * argument 3573 * 3574 * @throws NullPointerException If {@code delimiter} or {@code elements} 3575 * is {@code null} 3576 * 3577 * @see #join(CharSequence,CharSequence...) 3578 * @see java.util.StringJoiner 3579 * @since 1.8 3580 */ 3581 public static String join(CharSequence delimiter, 3582 Iterable<? 
extends CharSequence> elements) { 3583 Objects.requireNonNull(delimiter); 3584 Objects.requireNonNull(elements); 3585 var delim = delimiter.toString(); 3586 var elems = new String[8]; 3587 int size = 0; 3588 for (CharSequence cs: elements) { 3589 if (size >= elems.length) { 3590 elems = Arrays.copyOf(elems, elems.length << 1); 3591 } 3592 elems[size++] = String.valueOf(cs); 3593 } 3594 return join("", "", delim, elems, size); 3595 } 3596 3597 /** 3598 * Converts all of the characters in this {@code String} to lower 3599 * case using the rules of the given {@code Locale}. Case mapping is based 3600 * on the Unicode Standard version specified by the {@link java.lang.Character Character} 3601 * class. Since case mappings are not always 1:1 char mappings, the resulting {@code String} 3602 * and this {@code String} may differ in length. 3603 * <p> 3604 * Examples of lowercase mappings are in the following table: 3605 * <table class="plain"> 3606 * <caption style="display:none">Lowercase mapping examples showing language code of locale, upper case, lower case, and description</caption> 3607 * <thead> 3608 * <tr> 3609 * <th scope="col">Language Code of Locale</th> 3610 * <th scope="col">Upper Case</th> 3611 * <th scope="col">Lower Case</th> 3612 * <th scope="col">Description</th> 3613 * </tr> 3614 * </thead> 3615 * <tbody> 3616 * <tr> 3617 * <td>tr (Turkish)</td> 3618 * <th scope="row" style="font-weight:normal; text-align:left">\u0130</th> 3619 * <td>\u0069</td> 3620 * <td>capital letter I with dot above -> small letter i</td> 3621 * </tr> 3622 * <tr> 3623 * <td>tr (Turkish)</td> 3624 * <th scope="row" style="font-weight:normal; text-align:left">\u0049</th> 3625 * <td>\u0131</td> 3626 * <td>capital letter I -> small letter dotless i </td> 3627 * </tr> 3628 * <tr> 3629 * <td>(all)</td> 3630 * <th scope="row" style="font-weight:normal; text-align:left">French Fries</th> 3631 * <td>french fries</td> 3632 * <td>lowercased all chars in String</td> 3633 * </tr> 3634 * <tr> 
3635 * <td>(all)</td> 3636 * <th scope="row" style="font-weight:normal; text-align:left"> 3637 * ΙΧΘΥΣ</th> 3638 * <td>ιχθυσ</td> 3639 * <td>lowercased all chars in String</td> 3640 * </tr> 3641 * </tbody> 3642 * </table> 3643 * 3644 * @param locale use the case transformation rules for this locale 3645 * @return the {@code String}, converted to lowercase. 3646 * @see java.lang.String#toLowerCase() 3647 * @see java.lang.String#toUpperCase() 3648 * @see java.lang.String#toUpperCase(Locale) 3649 * @since 1.1 3650 */ 3651 public String toLowerCase(Locale locale) { 3652 return isLatin1() ? StringLatin1.toLowerCase(this, value, locale) 3653 : StringUTF16.toLowerCase(this, value, locale); 3654 } 3655 3656 /** 3657 * Converts all of the characters in this {@code String} to lower 3658 * case using the rules of the default locale. This method is equivalent to 3659 * {@code toLowerCase(Locale.getDefault())}. 3660 * 3661 * @apiNote This method is locale sensitive, and may produce unexpected 3662 * results if used for strings that are intended to be interpreted locale 3663 * independently. 3664 * Examples are programming language identifiers, protocol keys, and HTML 3665 * tags. 3666 * For instance, {@code "TITLE".toLowerCase()} in a Turkish locale 3667 * returns {@code "t\u005Cu0131tle"}, where '\u005Cu0131' is the 3668 * LATIN SMALL LETTER DOTLESS I character. 3669 * To obtain correct results for locale insensitive strings, use 3670 * {@code toLowerCase(Locale.ROOT)}. 3671 * 3672 * @return the {@code String}, converted to lowercase. 3673 * @see java.lang.String#toLowerCase(Locale) 3674 */ 3675 public String toLowerCase() { 3676 return toLowerCase(Locale.getDefault()); 3677 } 3678 3679 /** 3680 * Converts all of the characters in this {@code String} to upper 3681 * case using the rules of the given {@code Locale}. Case mapping is based 3682 * on the Unicode Standard version specified by the {@link java.lang.Character Character} 3683 * class. 
Since case mappings are not always 1:1 char mappings, the resulting {@code String} 3684 * and this {@code String} may differ in length. 3685 * <p> 3686 * Examples of locale-sensitive and 1:M case mappings are in the following table: 3687 * <table class="plain"> 3688 * <caption style="display:none">Examples of locale-sensitive and 1:M case mappings. Shows Language code of locale, lower case, upper case, and description.</caption> 3689 * <thead> 3690 * <tr> 3691 * <th scope="col">Language Code of Locale</th> 3692 * <th scope="col">Lower Case</th> 3693 * <th scope="col">Upper Case</th> 3694 * <th scope="col">Description</th> 3695 * </tr> 3696 * </thead> 3697 * <tbody> 3698 * <tr> 3699 * <td>tr (Turkish)</td> 3700 * <th scope="row" style="font-weight:normal; text-align:left">\u0069</th> 3701 * <td>\u0130</td> 3702 * <td>small letter i -> capital letter I with dot above</td> 3703 * </tr> 3704 * <tr> 3705 * <td>tr (Turkish)</td> 3706 * <th scope="row" style="font-weight:normal; text-align:left">\u0131</th> 3707 * <td>\u0049</td> 3708 * <td>small letter dotless i -> capital letter I</td> 3709 * </tr> 3710 * <tr> 3711 * <td>(all)</td> 3712 * <th scope="row" style="font-weight:normal; text-align:left">\u00df</th> 3713 * <td>\u0053 \u0053</td> 3714 * <td>small letter sharp s -> two letters: SS</td> 3715 * </tr> 3716 * <tr> 3717 * <td>(all)</td> 3718 * <th scope="row" style="font-weight:normal; text-align:left">Fahrvergnügen</th> 3719 * <td>FAHRVERGNÜGEN</td> 3720 * <td></td> 3721 * </tr> 3722 * </tbody> 3723 * </table> 3724 * @param locale use the case transformation rules for this locale 3725 * @return the {@code String}, converted to uppercase. 3726 * @see java.lang.String#toUpperCase() 3727 * @see java.lang.String#toLowerCase() 3728 * @see java.lang.String#toLowerCase(Locale) 3729 * @since 1.1 3730 */ 3731 public String toUpperCase(Locale locale) { 3732 return isLatin1() ? 
StringLatin1.toUpperCase(this, value, locale) 3733 : StringUTF16.toUpperCase(this, value, locale); 3734 } 3735 3736 /** 3737 * Converts all of the characters in this {@code String} to upper 3738 * case using the rules of the default locale. This method is equivalent to 3739 * {@code toUpperCase(Locale.getDefault())}. 3740 * 3741 * @apiNote This method is locale sensitive, and may produce unexpected 3742 * results if used for strings that are intended to be interpreted locale 3743 * independently. 3744 * Examples are programming language identifiers, protocol keys, and HTML 3745 * tags. 3746 * For instance, {@code "title".toUpperCase()} in a Turkish locale 3747 * returns {@code "T\u005Cu0130TLE"}, where '\u005Cu0130' is the 3748 * LATIN CAPITAL LETTER I WITH DOT ABOVE character. 3749 * To obtain correct results for locale insensitive strings, use 3750 * {@code toUpperCase(Locale.ROOT)}. 3751 * 3752 * @return the {@code String}, converted to uppercase. 3753 * @see java.lang.String#toUpperCase(Locale) 3754 */ 3755 public String toUpperCase() { 3756 return toUpperCase(Locale.getDefault()); 3757 } 3758 3759 /** 3760 * Returns a string whose value is this string, with all leading 3761 * and trailing space removed, where space is defined 3762 * as any character whose codepoint is less than or equal to 3763 * {@code 'U+0020'} (the space character). 3764 * <p> 3765 * If this {@code String} object represents an empty character 3766 * sequence, or the first and last characters of character sequence 3767 * represented by this {@code String} object both have codes 3768 * that are not space (as defined above), then a 3769 * reference to this {@code String} object is returned. 3770 * <p> 3771 * Otherwise, if all characters in this string are space (as 3772 * defined above), then a {@code String} object representing an 3773 * empty string is returned. 
3774 * <p> 3775 * Otherwise, let <i>k</i> be the index of the first character in the 3776 * string whose code is not a space (as defined above) and let 3777 * <i>m</i> be the index of the last character in the string whose code 3778 * is not a space (as defined above). A {@code String} 3779 * object is returned, representing the substring of this string that 3780 * begins with the character at index <i>k</i> and ends with the 3781 * character at index <i>m</i>-that is, the result of 3782 * {@code this.substring(k, m + 1)}. 3783 * <p> 3784 * This method may be used to trim space (as defined above) from 3785 * the beginning and end of a string. 3786 * 3787 * @return a string whose value is this string, with all leading 3788 * and trailing space removed, or this string if it 3789 * has no leading or trailing space. 3790 */ 3791 public String trim() { 3792 String ret = isLatin1() ? StringLatin1.trim(value) 3793 : StringUTF16.trim(value); 3794 return ret == null ? this : ret; 3795 } 3796 3797 /** 3798 * Returns a string whose value is this string, with all leading 3799 * and trailing {@linkplain Character#isWhitespace(int) white space} 3800 * removed. 3801 * <p> 3802 * If this {@code String} object represents an empty string, 3803 * or if all code points in this string are 3804 * {@linkplain Character#isWhitespace(int) white space}, then an empty string 3805 * is returned. 3806 * <p> 3807 * Otherwise, returns a substring of this string beginning with the first 3808 * code point that is not a {@linkplain Character#isWhitespace(int) white space} 3809 * up to and including the last code point that is not a 3810 * {@linkplain Character#isWhitespace(int) white space}. 3811 * <p> 3812 * This method may be used to strip 3813 * {@linkplain Character#isWhitespace(int) white space} from 3814 * the beginning and end of a string. 
3815 * 3816 * @return a string whose value is this string, with all leading 3817 * and trailing white space removed 3818 * 3819 * @see Character#isWhitespace(int) 3820 * 3821 * @since 11 3822 */ 3823 public String strip() { 3824 String ret = isLatin1() ? StringLatin1.strip(value) 3825 : StringUTF16.strip(value); 3826 return ret == null ? this : ret; 3827 } 3828 3829 /** 3830 * Returns a string whose value is this string, with all leading 3831 * {@linkplain Character#isWhitespace(int) white space} removed. 3832 * <p> 3833 * If this {@code String} object represents an empty string, 3834 * or if all code points in this string are 3835 * {@linkplain Character#isWhitespace(int) white space}, then an empty string 3836 * is returned. 3837 * <p> 3838 * Otherwise, returns a substring of this string beginning with the first 3839 * code point that is not a {@linkplain Character#isWhitespace(int) white space} 3840 * up to and including the last code point of this string. 3841 * <p> 3842 * This method may be used to trim 3843 * {@linkplain Character#isWhitespace(int) white space} from 3844 * the beginning of a string. 3845 * 3846 * @return a string whose value is this string, with all leading white 3847 * space removed 3848 * 3849 * @see Character#isWhitespace(int) 3850 * 3851 * @since 11 3852 */ 3853 public String stripLeading() { 3854 String ret = isLatin1() ? StringLatin1.stripLeading(value) 3855 : StringUTF16.stripLeading(value); 3856 return ret == null ? this : ret; 3857 } 3858 3859 /** 3860 * Returns a string whose value is this string, with all trailing 3861 * {@linkplain Character#isWhitespace(int) white space} removed. 3862 * <p> 3863 * If this {@code String} object represents an empty string, 3864 * or if all characters in this string are 3865 * {@linkplain Character#isWhitespace(int) white space}, then an empty string 3866 * is returned. 
3867 * <p> 3868 * Otherwise, returns a substring of this string beginning with the first 3869 * code point of this string up to and including the last code point 3870 * that is not a {@linkplain Character#isWhitespace(int) white space}. 3871 * <p> 3872 * This method may be used to trim 3873 * {@linkplain Character#isWhitespace(int) white space} from 3874 * the end of a string. 3875 * 3876 * @return a string whose value is this string, with all trailing white 3877 * space removed 3878 * 3879 * @see Character#isWhitespace(int) 3880 * 3881 * @since 11 3882 */ 3883 public String stripTrailing() { 3884 String ret = isLatin1() ? StringLatin1.stripTrailing(value) 3885 : StringUTF16.stripTrailing(value); 3886 return ret == null ? this : ret; 3887 } 3888 3889 /** 3890 * Returns {@code true} if the string is empty or contains only 3891 * {@linkplain Character#isWhitespace(int) white space} codepoints, 3892 * otherwise {@code false}. 3893 * 3894 * @return {@code true} if the string is empty or contains only 3895 * {@linkplain Character#isWhitespace(int) white space} codepoints, 3896 * otherwise {@code false} 3897 * 3898 * @see Character#isWhitespace(int) 3899 * 3900 * @since 11 3901 */ 3902 public boolean isBlank() { 3903 return indexOfNonWhitespace() == length(); 3904 } 3905 3906 /** 3907 * Returns a stream of lines extracted from this string, 3908 * separated by line terminators. 3909 * <p> 3910 * A <i>line terminator</i> is one of the following: 3911 * a line feed character {@code "\n"} (U+000A), 3912 * a carriage return character {@code "\r"} (U+000D), 3913 * or a carriage return followed immediately by a line feed 3914 * {@code "\r\n"} (U+000D U+000A). 3915 * <p> 3916 * A <i>line</i> is either a sequence of zero or more characters 3917 * followed by a line terminator, or it is a sequence of one or 3918 * more characters followed by the end of the string. A 3919 * line does not include the line terminator. 
3920 * <p> 3921 * The stream returned by this method contains the lines from 3922 * this string in the order in which they occur. 3923 * 3924 * @apiNote This definition of <i>line</i> implies that an empty 3925 * string has zero lines and that there is no empty line 3926 * following a line terminator at the end of a string. 3927 * 3928 * @implNote This method provides better performance than 3929 * split("\R") by supplying elements lazily and 3930 * by faster search of new line terminators. 3931 * 3932 * @return the stream of lines extracted from this string 3933 * 3934 * @since 11 3935 */ 3936 public Stream<String> lines() { 3937 return isLatin1() ? StringLatin1.lines(value) : StringUTF16.lines(value); 3938 } 3939 3940 /** 3941 * Adjusts the indentation of each line of this string based on the value of 3942 * {@code n}, and normalizes line termination characters. 3943 * <p> 3944 * This string is conceptually separated into lines using 3945 * {@link String#lines()}. Each line is then adjusted as described below 3946 * and then suffixed with a line feed {@code "\n"} (U+000A). The resulting 3947 * lines are then concatenated and returned. 3948 * <p> 3949 * If {@code n > 0} then {@code n} spaces (U+0020) are inserted at the 3950 * beginning of each line. 3951 * <p> 3952 * If {@code n < 0} then up to {@code n} 3953 * {@linkplain Character#isWhitespace(int) white space characters} are removed 3954 * from the beginning of each line. If a given line does not contain 3955 * sufficient white space then all leading 3956 * {@linkplain Character#isWhitespace(int) white space characters} are removed. 3957 * Each white space character is treated as a single character. In 3958 * particular, the tab character {@code "\t"} (U+0009) is considered a 3959 * single character; it is not expanded. 3960 * <p> 3961 * If {@code n == 0} then the line remains unchanged. However, line 3962 * terminators are still normalized. 
3963 * 3964 * @param n number of leading 3965 * {@linkplain Character#isWhitespace(int) white space characters} 3966 * to add or remove 3967 * 3968 * @return string with indentation adjusted and line endings normalized 3969 * 3970 * @see String#lines() 3971 * @see String#isBlank() 3972 * @see Character#isWhitespace(int) 3973 * 3974 * @since 12 3975 */ 3976 public String indent(int n) { 3977 if (isEmpty()) { 3978 return ""; 3979 } 3980 Stream<String> stream = lines(); 3981 if (n > 0) { 3982 final String spaces = " ".repeat(n); 3983 stream = stream.map(s -> spaces + s); 3984 } else if (n == Integer.MIN_VALUE) { 3985 stream = stream.map(s -> s.stripLeading()); 3986 } else if (n < 0) { 3987 stream = stream.map(s -> s.substring(Math.min(-n, s.indexOfNonWhitespace()))); 3988 } 3989 return stream.collect(Collectors.joining("\n", "", "\n")); 3990 } 3991 3992 private int indexOfNonWhitespace() { 3993 return isLatin1() ? StringLatin1.indexOfNonWhitespace(value) 3994 : StringUTF16.indexOfNonWhitespace(value); 3995 } 3996 3997 private int lastIndexOfNonWhitespace() { 3998 return isLatin1() ? StringLatin1.lastIndexOfNonWhitespace(value) 3999 : StringUTF16.lastIndexOfNonWhitespace(value); 4000 } 4001 4002 /** 4003 * Returns a string whose value is this string, with incidental 4004 * {@linkplain Character#isWhitespace(int) white space} removed from 4005 * the beginning and end of every line. 4006 * <p> 4007 * Incidental {@linkplain Character#isWhitespace(int) white space} 4008 * is often present in a text block to align the content with the opening 4009 * delimiter. For example, in the following code, dots represent incidental 4010 * {@linkplain Character#isWhitespace(int) white space}: 4011 * <blockquote><pre> 4012 * String html = """ 4013 * ..............<html> 4014 * .............. <body> 4015 * .............. <p>Hello, world</p> 4016 * .............. 
</body> 4017 * ..............</html> 4018 * .............."""; 4019 * </pre></blockquote> 4020 * This method treats the incidental 4021 * {@linkplain Character#isWhitespace(int) white space} as indentation to be 4022 * stripped, producing a string that preserves the relative indentation of 4023 * the content. Using | to visualize the start of each line of the string: 4024 * <blockquote><pre> 4025 * |<html> 4026 * | <body> 4027 * | <p>Hello, world</p> 4028 * | </body> 4029 * |</html> 4030 * </pre></blockquote> 4031 * First, the individual lines of this string are extracted. A <i>line</i> 4032 * is a sequence of zero or more characters followed by either a line 4033 * terminator or the end of the string. 4034 * If the string has at least one line terminator, the last line consists 4035 * of the characters between the last terminator and the end of the string. 4036 * Otherwise, if the string has no terminators, the last line is the start 4037 * of the string to the end of the string, in other words, the entire 4038 * string. 4039 * A line does not include the line terminator. 4040 * <p> 4041 * Then, the <i>minimum indentation</i> (min) is determined as follows: 4042 * <ul> 4043 * <li><p>For each non-blank line (as defined by {@link String#isBlank()}), 4044 * the leading {@linkplain Character#isWhitespace(int) white space} 4045 * characters are counted.</p> 4046 * </li> 4047 * <li><p>The leading {@linkplain Character#isWhitespace(int) white space} 4048 * characters on the last line are also counted even if 4049 * {@linkplain String#isBlank() blank}.</p> 4050 * </li> 4051 * </ul> 4052 * <p>The <i>min</i> value is the smallest of these counts. 4053 * <p> 4054 * For each {@linkplain String#isBlank() non-blank} line, <i>min</i> leading 4055 * {@linkplain Character#isWhitespace(int) white space} characters are 4056 * removed, and any trailing {@linkplain Character#isWhitespace(int) white 4057 * space} characters are removed. 
{@linkplain String#isBlank() Blank} lines 4058 * are replaced with the empty string. 4059 * 4060 * <p> 4061 * Finally, the lines are joined into a new string, using the LF character 4062 * {@code "\n"} (U+000A) to separate lines. 4063 * 4064 * @apiNote 4065 * This method's primary purpose is to shift a block of lines as far as 4066 * possible to the left, while preserving relative indentation. Lines 4067 * that were indented the least will thus have no leading 4068 * {@linkplain Character#isWhitespace(int) white space}. 4069 * The result will have the same number of line terminators as this string. 4070 * If this string ends with a line terminator then the result will end 4071 * with a line terminator. 4072 * 4073 * @implSpec 4074 * This method treats all {@linkplain Character#isWhitespace(int) white space} 4075 * characters as having equal width. As long as the indentation on every 4076 * line is consistently composed of the same character sequences, then the 4077 * result will be as described above. 4078 * 4079 * @return string with incidental indentation removed and line 4080 * terminators normalized 4081 * 4082 * @see String#lines() 4083 * @see String#isBlank() 4084 * @see String#indent(int) 4085 * @see Character#isWhitespace(int) 4086 * 4087 * @since 15 4088 * 4089 */ 4090 public String stripIndent() { 4091 int length = length(); 4092 if (length == 0) { 4093 return ""; 4094 } 4095 char lastChar = charAt(length - 1); 4096 boolean optOut = lastChar == '\n' || lastChar == '\r'; 4097 List<String> lines = lines().toList(); 4098 final int outdent = optOut ? 0 : outdent(lines); 4099 return lines.stream() 4100 .map(line -> { 4101 int firstNonWhitespace = line.indexOfNonWhitespace(); 4102 int lastNonWhitespace = line.lastIndexOfNonWhitespace(); 4103 int incidentalWhitespace = Math.min(outdent, firstNonWhitespace); 4104 return firstNonWhitespace > lastNonWhitespace 4105 ? 
"" : line.substring(incidentalWhitespace, lastNonWhitespace); 4106 }) 4107 .collect(Collectors.joining("\n", "", optOut ? "\n" : "")); 4108 } 4109 4110 private static int outdent(List<String> lines) { 4111 // Note: outdent is guaranteed to be zero or positive number. 4112 // If there isn't a non-blank line then the last must be blank 4113 int outdent = Integer.MAX_VALUE; 4114 for (String line : lines) { 4115 int leadingWhitespace = line.indexOfNonWhitespace(); 4116 if (leadingWhitespace != line.length()) { 4117 outdent = Integer.min(outdent, leadingWhitespace); 4118 } 4119 } 4120 String lastLine = lines.get(lines.size() - 1); 4121 if (lastLine.isBlank()) { 4122 outdent = Integer.min(outdent, lastLine.length()); 4123 } 4124 return outdent; 4125 } 4126 4127 /** 4128 * Returns a string whose value is this string, with escape sequences 4129 * translated as if in a string literal. 4130 * <p> 4131 * Escape sequences are translated as follows; 4132 * <table class="striped"> 4133 * <caption style="display:none">Translation</caption> 4134 * <thead> 4135 * <tr> 4136 * <th scope="col">Escape</th> 4137 * <th scope="col">Name</th> 4138 * <th scope="col">Translation</th> 4139 * </tr> 4140 * </thead> 4141 * <tbody> 4142 * <tr> 4143 * <th scope="row">{@code \u005Cb}</th> 4144 * <td>backspace</td> 4145 * <td>{@code U+0008}</td> 4146 * </tr> 4147 * <tr> 4148 * <th scope="row">{@code \u005Ct}</th> 4149 * <td>horizontal tab</td> 4150 * <td>{@code U+0009}</td> 4151 * </tr> 4152 * <tr> 4153 * <th scope="row">{@code \u005Cn}</th> 4154 * <td>line feed</td> 4155 * <td>{@code U+000A}</td> 4156 * </tr> 4157 * <tr> 4158 * <th scope="row">{@code \u005Cf}</th> 4159 * <td>form feed</td> 4160 * <td>{@code U+000C}</td> 4161 * </tr> 4162 * <tr> 4163 * <th scope="row">{@code \u005Cr}</th> 4164 * <td>carriage return</td> 4165 * <td>{@code U+000D}</td> 4166 * </tr> 4167 * <tr> 4168 * <th scope="row">{@code \u005Cs}</th> 4169 * <td>space</td> 4170 * <td>{@code U+0020}</td> 4171 * </tr> 4172 * <tr> 4173 
* <th scope="row">{@code \u005C"}</th> 4174 * <td>double quote</td> 4175 * <td>{@code U+0022}</td> 4176 * </tr> 4177 * <tr> 4178 * <th scope="row">{@code \u005C'}</th> 4179 * <td>single quote</td> 4180 * <td>{@code U+0027}</td> 4181 * </tr> 4182 * <tr> 4183 * <th scope="row">{@code \u005C\u005C}</th> 4184 * <td>backslash</td> 4185 * <td>{@code U+005C}</td> 4186 * </tr> 4187 * <tr> 4188 * <th scope="row">{@code \u005C0 - \u005C377}</th> 4189 * <td>octal escape</td> 4190 * <td>code point equivalents</td> 4191 * </tr> 4192 * <tr> 4193 * <th scope="row">{@code \u005C<line-terminator>}</th> 4194 * <td>continuation</td> 4195 * <td>discard</td> 4196 * </tr> 4197 * </tbody> 4198 * </table> 4199 * 4200 * @implNote 4201 * This method does <em>not</em> translate Unicode escapes such as "{@code \u005cu2022}". 4202 * Unicode escapes are translated by the Java compiler when reading input characters and 4203 * are not part of the string literal specification. 4204 * 4205 * @throws IllegalArgumentException when an escape sequence is malformed. 4206 * 4207 * @return String with escape sequences translated. 4208 * 4209 * @jls 3.10.7 Escape Sequences 4210 * 4211 * @since 15 4212 */ 4213 public String translateEscapes() { 4214 if (isEmpty()) { 4215 return ""; 4216 } 4217 char[] chars = toCharArray(); 4218 int length = chars.length; 4219 int from = 0; 4220 int to = 0; 4221 while (from < length) { 4222 char ch = chars[from++]; 4223 if (ch == '\\') { 4224 ch = from < length ? chars[from++] : '\0'; 4225 switch (ch) { 4226 case 'b': 4227 ch = '\b'; 4228 break; 4229 case 'f': 4230 ch = '\f'; 4231 break; 4232 case 'n': 4233 ch = '\n'; 4234 break; 4235 case 'r': 4236 ch = '\r'; 4237 break; 4238 case 's': 4239 ch = ' '; 4240 break; 4241 case 't': 4242 ch = '\t'; 4243 break; 4244 case '\'': 4245 case '\"': 4246 case '\\': 4247 // as is 4248 break; 4249 case '0': case '1': case '2': case '3': 4250 case '4': case '5': case '6': case '7': 4251 int limit = Integer.min(from + (ch <= '3' ? 
2 : 1), length); 4252 int code = ch - '0'; 4253 while (from < limit) { 4254 ch = chars[from]; 4255 if (ch < '0' || '7' < ch) { 4256 break; 4257 } 4258 from++; 4259 code = (code << 3) | (ch - '0'); 4260 } 4261 ch = (char)code; 4262 break; 4263 case '\n': 4264 continue; 4265 case '\r': 4266 if (from < length && chars[from] == '\n') { 4267 from++; 4268 } 4269 continue; 4270 default: { 4271 String msg = String.format( 4272 "Invalid escape sequence: \\%c \\\\u%04X", 4273 ch, (int)ch); 4274 throw new IllegalArgumentException(msg); 4275 } 4276 } 4277 } 4278 4279 chars[to++] = ch; 4280 } 4281 4282 return new String(chars, 0, to); 4283 } 4284 4285 /** 4286 * This method allows the application of a function to {@code this} 4287 * string. The function should expect a single String argument 4288 * and produce an {@code R} result. 4289 * <p> 4290 * Any exception thrown by {@code f.apply()} will be propagated to the 4291 * caller. 4292 * 4293 * @param f a function to apply 4294 * 4295 * @param <R> the type of the result 4296 * 4297 * @return the result of applying the function to this string 4298 * 4299 * @see java.util.function.Function 4300 * 4301 * @since 12 4302 */ 4303 public <R> R transform(Function<? super String, ? extends R> f) { 4304 return f.apply(this); 4305 } 4306 4307 /** 4308 * This object (which is already a string!) is itself returned. 4309 * 4310 * @return the string itself. 4311 */ 4312 public String toString() { 4313 return this; 4314 } 4315 4316 /** 4317 * Returns a stream of {@code int} zero-extending the {@code char} values 4318 * from this sequence. Any char which maps to a {@linkplain 4319 * Character##unicode surrogate code point} is passed through 4320 * uninterpreted. 4321 * 4322 * @return an IntStream of char values from this sequence 4323 * @since 9 4324 */ 4325 @Override 4326 public IntStream chars() { 4327 return StreamSupport.intStream( 4328 isLatin1() ? 
new StringLatin1.CharsSpliterator(value, Spliterator.IMMUTABLE) 4329 : new StringUTF16.CharsSpliterator(value, Spliterator.IMMUTABLE), 4330 false); 4331 } 4332 4333 4334 /** 4335 * Returns a stream of code point values from this sequence. Any surrogate 4336 * pairs encountered in the sequence are combined as if by {@linkplain 4337 * Character#toCodePoint Character.toCodePoint} and the result is passed 4338 * to the stream. Any other code units, including ordinary BMP characters, 4339 * unpaired surrogates, and undefined code units, are zero-extended to 4340 * {@code int} values which are then passed to the stream. 4341 * 4342 * @return an IntStream of Unicode code points from this sequence 4343 * @since 9 4344 */ 4345 @Override 4346 public IntStream codePoints() { 4347 return StreamSupport.intStream( 4348 isLatin1() ? new StringLatin1.CharsSpliterator(value, Spliterator.IMMUTABLE) 4349 : new StringUTF16.CodePointsSpliterator(value, Spliterator.IMMUTABLE), 4350 false); 4351 } 4352 4353 /** 4354 * Converts this string to a new character array. 4355 * 4356 * @return a newly allocated character array whose length is the length 4357 * of this string and whose contents are initialized to contain 4358 * the character sequence represented by this string. 4359 */ 4360 public char[] toCharArray() { 4361 return isLatin1() ? StringLatin1.toChars(value) 4362 : StringUTF16.toChars(value); 4363 } 4364 4365 /** 4366 * Returns a formatted string using the specified format string and 4367 * arguments. 4368 * 4369 * <p> The locale always used is the one returned by {@link 4370 * java.util.Locale#getDefault(java.util.Locale.Category) 4371 * Locale.getDefault(Locale.Category)} with 4372 * {@link java.util.Locale.Category#FORMAT FORMAT} category specified. 4373 * 4374 * @param format 4375 * A <a href="../util/Formatter.html#syntax">format string</a> 4376 * 4377 * @param args 4378 * Arguments referenced by the format specifiers in the format 4379 * string. 
If there are more arguments than format specifiers, the 4380 * extra arguments are ignored. The number of arguments is 4381 * variable and may be zero. The maximum number of arguments is 4382 * limited by the maximum dimension of a Java array as defined by 4383 * <cite>The Java Virtual Machine Specification</cite>. 4384 * The behaviour on a 4385 * {@code null} argument depends on the <a 4386 * href="../util/Formatter.html#syntax">conversion</a>. 4387 * 4388 * @throws java.util.IllegalFormatException 4389 * If a format string contains an illegal syntax, a format 4390 * specifier that is incompatible with the given arguments, 4391 * insufficient arguments given the format string, or other 4392 * illegal conditions. For specification of all possible 4393 * formatting errors, see the <a 4394 * href="../util/Formatter.html#detail">Details</a> section of the 4395 * formatter class specification. 4396 * 4397 * @return A formatted string 4398 * 4399 * @see java.util.Formatter 4400 * @since 1.5 4401 */ 4402 public static String format(String format, Object... args) { 4403 return new Formatter().format(format, args).toString(); 4404 } 4405 4406 /** 4407 * Returns a formatted string using the specified locale, format string, 4408 * and arguments. 4409 * 4410 * @param l 4411 * The {@linkplain java.util.Locale locale} to apply during 4412 * formatting. If {@code l} is {@code null} then no localization 4413 * is applied. 4414 * 4415 * @param format 4416 * A <a href="../util/Formatter.html#syntax">format string</a> 4417 * 4418 * @param args 4419 * Arguments referenced by the format specifiers in the format 4420 * string. If there are more arguments than format specifiers, the 4421 * extra arguments are ignored. The number of arguments is 4422 * variable and may be zero. The maximum number of arguments is 4423 * limited by the maximum dimension of a Java array as defined by 4424 * <cite>The Java Virtual Machine Specification</cite>. 
4425 * The behaviour on a 4426 * {@code null} argument depends on the 4427 * <a href="../util/Formatter.html#syntax">conversion</a>. 4428 * 4429 * @throws java.util.IllegalFormatException 4430 * If a format string contains an illegal syntax, a format 4431 * specifier that is incompatible with the given arguments, 4432 * insufficient arguments given the format string, or other 4433 * illegal conditions. For specification of all possible 4434 * formatting errors, see the <a 4435 * href="../util/Formatter.html#detail">Details</a> section of the 4436 * formatter class specification 4437 * 4438 * @return A formatted string 4439 * 4440 * @see java.util.Formatter 4441 * @since 1.5 4442 */ 4443 public static String format(Locale l, String format, Object... args) { 4444 return new Formatter(l).format(format, args).toString(); 4445 } 4446 4447 /** 4448 * Formats using this string as the format string, and the supplied 4449 * arguments. 4450 * 4451 * @implSpec This method is equivalent to {@code String.format(this, args)}. 4452 * 4453 * @param args 4454 * Arguments referenced by the format specifiers in this string. 4455 * 4456 * @return A formatted string 4457 * 4458 * @see java.lang.String#format(String,Object...) 4459 * @see java.util.Formatter 4460 * 4461 * @since 15 4462 * 4463 */ 4464 public String formatted(Object... args) { 4465 return new Formatter().format(this, args).toString(); 4466 } 4467 4468 /** 4469 * Returns the string representation of the {@code Object} argument. 4470 * 4471 * @param obj an {@code Object}. 4472 * @return if the argument is {@code null}, then a string equal to 4473 * {@code "null"}; otherwise, the value of 4474 * {@code obj.toString()} is returned. 4475 * @see java.lang.Object#toString() 4476 */ 4477 public static String valueOf(Object obj) { 4478 return (obj == null) ? "null" : obj.toString(); 4479 } 4480 4481 /** 4482 * Returns the string representation of the {@code char} array 4483 * argument. 
The contents of the character array are copied; subsequent 4484 * modification of the character array does not affect the returned 4485 * string. 4486 * 4487 * @param data the character array. 4488 * @return a {@code String} that contains the characters of the 4489 * character array. 4490 */ 4491 public static String valueOf(char[] data) { 4492 return new String(data); 4493 } 4494 4495 /** 4496 * Returns the string representation of a specific subarray of the 4497 * {@code char} array argument. 4498 * <p> 4499 * The {@code offset} argument is the index of the first 4500 * character of the subarray. The {@code count} argument 4501 * specifies the length of the subarray. The contents of the subarray 4502 * are copied; subsequent modification of the character array does not 4503 * affect the returned string. 4504 * 4505 * @param data the character array. 4506 * @param offset initial offset of the subarray. 4507 * @param count length of the subarray. 4508 * @return a {@code String} that contains the characters of the 4509 * specified subarray of the character array. 4510 * @throws IndexOutOfBoundsException if {@code offset} is 4511 * negative, or {@code count} is negative, or 4512 * {@code offset+count} is larger than 4513 * {@code data.length}. 4514 */ 4515 public static String valueOf(char[] data, int offset, int count) { 4516 return new String(data, offset, count); 4517 } 4518 4519 /** 4520 * Equivalent to {@link #valueOf(char[], int, int)}. 4521 * 4522 * @param data the character array. 4523 * @param offset initial offset of the subarray. 4524 * @param count length of the subarray. 4525 * @return a {@code String} that contains the characters of the 4526 * specified subarray of the character array. 4527 * @throws IndexOutOfBoundsException if {@code offset} is 4528 * negative, or {@code count} is negative, or 4529 * {@code offset+count} is larger than 4530 * {@code data.length}. 
4531 */ 4532 public static String copyValueOf(char[] data, int offset, int count) { 4533 return new String(data, offset, count); 4534 } 4535 4536 /** 4537 * Equivalent to {@link #valueOf(char[])}. 4538 * 4539 * @param data the character array. 4540 * @return a {@code String} that contains the characters of the 4541 * character array. 4542 */ 4543 public static String copyValueOf(char[] data) { 4544 return new String(data); 4545 } 4546 4547 /** 4548 * Returns the string representation of the {@code boolean} argument. 4549 * 4550 * @param b a {@code boolean}. 4551 * @return if the argument is {@code true}, a string equal to 4552 * {@code "true"} is returned; otherwise, a string equal to 4553 * {@code "false"} is returned. 4554 */ 4555 public static String valueOf(boolean b) { 4556 return b ? "true" : "false"; 4557 } 4558 4559 /** 4560 * Returns the string representation of the {@code char} 4561 * argument. 4562 * 4563 * @param c a {@code char}. 4564 * @return a string of length {@code 1} containing 4565 * as its single character the argument {@code c}. 4566 */ 4567 public static String valueOf(char c) { 4568 if (COMPACT_STRINGS && StringLatin1.canEncode(c)) { 4569 return new String(StringLatin1.toBytes(c), LATIN1); 4570 } 4571 return new String(StringUTF16.toBytes(c), UTF16); 4572 } 4573 4574 /** 4575 * Returns the string representation of the {@code int} argument. 4576 * <p> 4577 * The representation is exactly the one returned by the 4578 * {@code Integer.toString} method of one argument. 4579 * 4580 * @param i an {@code int}. 4581 * @return a string representation of the {@code int} argument. 4582 * @see java.lang.Integer#toString(int, int) 4583 */ 4584 public static String valueOf(int i) { 4585 return Integer.toString(i); 4586 } 4587 4588 /** 4589 * Returns the string representation of the {@code long} argument. 4590 * <p> 4591 * The representation is exactly the one returned by the 4592 * {@code Long.toString} method of one argument. 
4593 * 4594 * @param l a {@code long}. 4595 * @return a string representation of the {@code long} argument. 4596 * @see java.lang.Long#toString(long) 4597 */ 4598 public static String valueOf(long l) { 4599 return Long.toString(l); 4600 } 4601 4602 /** 4603 * Returns the string representation of the {@code float} argument. 4604 * <p> 4605 * The representation is exactly the one returned by the 4606 * {@code Float.toString} method of one argument. 4607 * 4608 * @param f a {@code float}. 4609 * @return a string representation of the {@code float} argument. 4610 * @see java.lang.Float#toString(float) 4611 */ 4612 public static String valueOf(float f) { 4613 return Float.toString(f); 4614 } 4615 4616 /** 4617 * Returns the string representation of the {@code double} argument. 4618 * <p> 4619 * The representation is exactly the one returned by the 4620 * {@code Double.toString} method of one argument. 4621 * 4622 * @param d a {@code double}. 4623 * @return a string representation of the {@code double} argument. 4624 * @see java.lang.Double#toString(double) 4625 */ 4626 public static String valueOf(double d) { 4627 return Double.toString(d); 4628 } 4629 4630 /** 4631 * Returns a canonical representation for the string object. 4632 * <p> 4633 * A pool of strings, initially empty, is maintained privately by the 4634 * class {@code String}. 4635 * <p> 4636 * When the intern method is invoked, if the pool already contains a 4637 * string equal to this {@code String} object as determined by 4638 * the {@link #equals(Object)} method, then the string from the pool is 4639 * returned. Otherwise, this {@code String} object is added to the 4640 * pool and a reference to this {@code String} object is returned. 4641 * <p> 4642 * It follows that for any two strings {@code s} and {@code t}, 4643 * {@code s.intern() == t.intern()} is {@code true} 4644 * if and only if {@code s.equals(t)} is {@code true}. 
4645 * <p> 4646 * All literal strings and string-valued constant expressions are 4647 * interned. String literals are defined in section {@jls 3.10.5} of the 4648 * <cite>The Java Language Specification</cite>. 4649 * 4650 * @return a string that has the same contents as this string, but is 4651 * guaranteed to be from a pool of unique strings. 4652 */ 4653 public native String intern(); 4654 4655 /** 4656 * Returns a string whose value is the concatenation of this 4657 * string repeated {@code count} times. 4658 * <p> 4659 * If this string is empty or count is zero then the empty 4660 * string is returned. 4661 * 4662 * @param count number of times to repeat 4663 * 4664 * @return A string composed of this string repeated 4665 * {@code count} times or the empty string if this 4666 * string is empty or count is zero 4667 * 4668 * @throws IllegalArgumentException if the {@code count} is 4669 * negative. 4670 * 4671 * @since 11 4672 */ 4673 public String repeat(int count) { 4674 if (count < 0) { 4675 throw new IllegalArgumentException("count is negative: " + count); 4676 } 4677 if (count == 1) { 4678 return this; 4679 } 4680 final int len = value.length; 4681 if (len == 0 || count == 0) { 4682 return ""; 4683 } 4684 if (Integer.MAX_VALUE / count < len) { 4685 throw new OutOfMemoryError("Required length exceeds implementation limit"); 4686 } 4687 if (len == 1) { 4688 final byte[] single = new byte[count]; 4689 Arrays.fill(single, value[0]); 4690 return new String(single, coder); 4691 } 4692 final int limit = len * count; 4693 final byte[] multiple = new byte[limit]; 4694 System.arraycopy(value, 0, multiple, 0, len); 4695 repeatCopyRest(multiple, 0, limit, len); 4696 return new String(multiple, coder); 4697 } 4698 4699 /** 4700 * Used to perform copying after the initial insertion. Copying is optimized 4701 * by using power of two duplication. 
First pass duplicates original copy, 4702 * second pass then duplicates the original and the copy yielding four copies, 4703 * third pass duplicates four copies yielding eight copies, and so on. 4704 * Finally, the remainder is filled in with prior copies. 4705 * 4706 * @implNote The technique used here is significantly faster than hand-rolled 4707 * loops or special casing small numbers due to the intensive optimization 4708 * done by intrinsic {@code System.arraycopy}. 4709 * 4710 * @param buffer destination buffer 4711 * @param offset offset in the destination buffer 4712 * @param limit total replicated including what is already in the buffer 4713 * @param copied number of bytes that have already in the buffer 4714 */ 4715 static void repeatCopyRest(byte[] buffer, int offset, int limit, int copied) { 4716 // Initial copy is in the buffer. 4717 for (; copied < limit - copied; copied <<= 1) { 4718 // Power of two duplicate. 4719 System.arraycopy(buffer, offset, buffer, offset + copied, copied); 4720 } 4721 // Duplicate remainder. 4722 System.arraycopy(buffer, offset, buffer, offset + copied, limit - copied); 4723 } 4724 4725 //////////////////////////////////////////////////////////////// 4726 4727 /** 4728 * Copy character bytes from this string into dst starting at dstBegin. 4729 * This method doesn't perform any range checking. 4730 * 4731 * Invoker guarantees: dst is in UTF16 (inflate itself for asb), if two 4732 * coders are different, and dst is big enough (range check) 4733 * 4734 * @param dstBegin the char index, not offset of byte[] 4735 * @param coder the coder of dst[] 4736 */ 4737 void getBytes(byte[] dst, int dstBegin, byte coder) { 4738 if (coder() == coder) { 4739 System.arraycopy(value, 0, dst, dstBegin << coder, value.length); 4740 } else { // this.coder == LATIN && coder == UTF16 4741 StringLatin1.inflate(value, 0, dst, dstBegin, value.length); 4742 } 4743 } 4744 4745 /** 4746 * Copy character bytes from this string into dst starting at dstBegin. 
4747 * This method doesn't perform any range checking. 4748 * 4749 * Invoker guarantees: dst is in UTF16 (inflate itself for asb), if two 4750 * coders are different, and dst is big enough (range check) 4751 * 4752 * @param srcPos the char index, not offset of byte[] 4753 * @param dstBegin the char index to start from 4754 * @param coder the coder of dst[] 4755 * @param length the amount of copied chars 4756 */ 4757 void getBytes(byte[] dst, int srcPos, int dstBegin, byte coder, int length) { 4758 if (coder() == coder) { 4759 System.arraycopy(value, srcPos << coder, dst, dstBegin << coder, length << coder); 4760 } else { // this.coder == LATIN && coder == UTF16 4761 StringLatin1.inflate(value, srcPos, dst, dstBegin, length); 4762 } 4763 } 4764 4765 /* 4766 * Package private constructor. Trailing Void argument is there for 4767 * disambiguating it against other (public) constructors. 4768 * 4769 * Stores the char[] value into a byte[] that each byte represents 4770 * the8 low-order bits of the corresponding character, if the char[] 4771 * contains only latin1 character. Or a byte[] that stores all 4772 * characters in their byte sequences defined by the {@code StringUTF16}. 4773 */ 4774 String(char[] value, int off, int len, Void sig) { 4775 if (len == 0) { 4776 this.value = "".value; 4777 this.coder = "".coder; 4778 return; 4779 } 4780 if (COMPACT_STRINGS) { 4781 byte[] val = StringUTF16.compress(value, off, len); 4782 if (val != null) { 4783 this.value = val; 4784 this.coder = LATIN1; 4785 return; 4786 } 4787 } 4788 this.coder = UTF16; 4789 this.value = StringUTF16.toBytes(value, off, len); 4790 } 4791 4792 /* 4793 * Package private constructor. Trailing Void argument is there for 4794 * disambiguating it against other (public) constructors. 
4795 */ 4796 String(AbstractStringBuilder asb, Void sig) { 4797 byte[] val = asb.getValue(); 4798 int length = asb.length(); 4799 if (asb.isLatin1()) { 4800 this.coder = LATIN1; 4801 this.value = Arrays.copyOfRange(val, 0, length); 4802 } else { 4803 // only try to compress val if some characters were deleted. 4804 if (COMPACT_STRINGS && asb.maybeLatin1) { 4805 byte[] buf = StringUTF16.compress(val, 0, length); 4806 if (buf != null) { 4807 this.coder = LATIN1; 4808 this.value = buf; 4809 return; 4810 } 4811 } 4812 this.coder = UTF16; 4813 this.value = Arrays.copyOfRange(val, 0, length << 1); 4814 } 4815 } 4816 4817 /* 4818 * Package private constructor which shares value array for speed. 4819 */ 4820 String(byte[] value, byte coder) { 4821 this.value = value; 4822 this.coder = coder; 4823 } 4824 4825 byte coder() { 4826 return COMPACT_STRINGS ? coder : UTF16; 4827 } 4828 4829 byte[] value() { 4830 return value; 4831 } 4832 4833 boolean isLatin1() { 4834 return COMPACT_STRINGS && coder == LATIN1; 4835 } 4836 4837 @Native static final byte LATIN1 = 0; 4838 @Native static final byte UTF16 = 1; 4839 4840 /* 4841 * StringIndexOutOfBoundsException if {@code index} is 4842 * negative or greater than or equal to {@code length}. 4843 */ 4844 static void checkIndex(int index, int length) { 4845 Preconditions.checkIndex(index, length, Preconditions.SIOOBE_FORMATTER); 4846 } 4847 4848 /* 4849 * StringIndexOutOfBoundsException if {@code offset} 4850 * is negative or greater than {@code length}. 4851 */ 4852 static void checkOffset(int offset, int length) { 4853 Preconditions.checkFromToIndex(offset, length, length, Preconditions.SIOOBE_FORMATTER); 4854 } 4855 4856 /* 4857 * Check {@code offset}, {@code count} against {@code 0} and {@code length} 4858 * bounds. 
4859 * 4860 * @return {@code offset} if the sub-range within bounds of the range 4861 * @throws StringIndexOutOfBoundsException 4862 * If {@code offset} is negative, {@code count} is negative, 4863 * or {@code offset} is greater than {@code length - count} 4864 */ 4865 static int checkBoundsOffCount(int offset, int count, int length) { 4866 return Preconditions.checkFromIndexSize(offset, count, length, Preconditions.SIOOBE_FORMATTER); 4867 } 4868 4869 /* 4870 * Check {@code begin}, {@code end} against {@code 0} and {@code length} 4871 * bounds. 4872 * 4873 * @throws StringIndexOutOfBoundsException 4874 * If {@code begin} is negative, {@code begin} is greater than 4875 * {@code end}, or {@code end} is greater than {@code length}. 4876 */ 4877 static void checkBoundsBeginEnd(int begin, int end, int length) { 4878 Preconditions.checkFromToIndex(begin, end, length, Preconditions.SIOOBE_FORMATTER); 4879 } 4880 4881 /** 4882 * Returns the string representation of the {@code codePoint} 4883 * argument. 4884 * 4885 * @param codePoint a {@code codePoint}. 4886 * @return a string of length {@code 1} or {@code 2} containing 4887 * as its single character the argument {@code codePoint}. 4888 * @throws IllegalArgumentException if the specified 4889 * {@code codePoint} is not a {@linkplain Character#isValidCodePoint 4890 * valid Unicode code point}. 
4891 */ 4892 static String valueOfCodePoint(int codePoint) { 4893 if (COMPACT_STRINGS && StringLatin1.canEncode(codePoint)) { 4894 return new String(StringLatin1.toBytes((char)codePoint), LATIN1); 4895 } else if (Character.isBmpCodePoint(codePoint)) { 4896 return new String(StringUTF16.toBytes((char)codePoint), UTF16); 4897 } else if (Character.isSupplementaryCodePoint(codePoint)) { 4898 return new String(StringUTF16.toBytesSupplementary(codePoint), UTF16); 4899 } 4900 4901 throw new IllegalArgumentException( 4902 format("Not a valid Unicode code point: 0x%X", codePoint)); 4903 } 4904 4905 /** 4906 * Returns an {@link Optional} containing the nominal descriptor for this 4907 * instance, which is the instance itself. 4908 * 4909 * @return an {@link Optional} describing the {@linkplain String} instance 4910 * @since 12 4911 */ 4912 @Override 4913 public Optional<String> describeConstable() { 4914 return Optional.of(this); 4915 } 4916 4917 /** 4918 * Resolves this instance as a {@link ConstantDesc}, the result of which is 4919 * the instance itself. 4920 * 4921 * @param lookup ignored 4922 * @return the {@linkplain String} instance 4923 * @since 12 4924 */ 4925 @Override 4926 public String resolveConstantDesc(MethodHandles.Lookup lookup) { 4927 return this; 4928 } 4929 4930 }