1 /*
   2  * Copyright (c) 1996, 2022, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 
  24 /*
  25  * @test
  26  * @bug 4035266 4052418 4068133 4068137 4068139 4086052 4095322 4097779
  27  *      4097920 4098467 4111338 4113835 4117554 4143071 4146175 4152117
  28  *      4152416 4153072 4158381 4214367 4217703 4638433 8264765 8291660
  29  *      8294008
  30  * @library /java/text/testlib
  31  * @run main/timeout=2000 BreakIteratorTest
  32  * @summary test BreakIterator
  33  */
  34 
  35 /*
  36  * This file is available under and governed by the GNU General Public
  37  * License version 2 only, as published by the Free Software Foundation.
  38  * However, the following notice accompanied the original version of this
  39  * file and, per its terms, should not be removed:
  40  *
  41  * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
  42  * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
  43  *
  44  * Portions copyright (c) 2007 Sun Microsystems, Inc.
  45  * All Rights Reserved.
  46  *
  47  * The original version of this source code and documentation
  48  * is copyrighted and owned by Taligent, Inc., a wholly-owned
  49  * subsidiary of IBM. These materials are provided under terms
  50  * of a License Agreement between Taligent and Sun. This technology
  51  * is protected by multiple US and International patents.
  52  *
  53  * This notice and attribution to Taligent may not be removed.
  54  * Taligent is a registered trademark of Taligent, Inc.
  55  *
  56  * Permission to use, copy, modify, and distribute this software
  57  * and its documentation for NON-COMMERCIAL purposes and without
  58  * fee is hereby granted provided that this copyright notice
  59  * appears in all copies. Please refer to the file "copyright.html"
  60  * for further important copyright and licensing information.
  61  *
  62  * SUN MAKES NO REPRESENTATIONS OR WARRANTIES ABOUT THE SUITABILITY OF
  63  * THE SOFTWARE, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
  64  * TO THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
  65  * PARTICULAR PURPOSE, OR NON-INFRINGEMENT. SUN SHALL NOT BE LIABLE FOR
  66  * ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR
  67  * DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES.
  68  *
  69  */
  70 
  71 import java.nio.file.Files;
  72 import java.nio.file.Paths;
  73 import java.text.BreakIterator;
  74 import java.text.StringCharacterIterator;
  75 import java.util.Arrays;
  76 import java.util.Locale;
  77 import java.util.Vector;
  78 import java.util.function.Predicate;
  79 import java.util.regex.Pattern;
  80 
  81 public class BreakIteratorTest extends IntlTest
  82 {
    // Iterators shared by the test methods, one per break type. All four are
    // assigned in the constructor from the no-argument BreakIterator factories.
    private BreakIterator characterBreak;
    private BreakIterator wordBreak;
    private BreakIterator lineBreak;
    private BreakIterator sentenceBreak;
  87 
  88     public static void main(String[] args) throws Exception {
  89         new BreakIteratorTest().run(args);
  90     }
  91 
  92     public BreakIteratorTest()
  93     {
  94         characterBreak = BreakIterator.getCharacterInstance();
  95         wordBreak = BreakIterator.getWordInstance();
  96         lineBreak = BreakIterator.getLineInstance();
  97         sentenceBreak = BreakIterator.getSentenceInstance();
  98     }
  99 
 100     //=========================================================================
 101     // general test subroutines
 102     //=========================================================================
 103 
 104     private void generalIteratorTest(BreakIterator bi, Vector expectedResult) {
 105         StringBuffer buffer = new StringBuffer();
 106         String text;
 107         for (int i = 0; i < expectedResult.size(); i++) {
 108             text = (String)expectedResult.elementAt(i);
 109             buffer.append(text);
 110         }
 111         text = buffer.toString();
 112 
 113         bi.setText(text);
 114 
 115         Vector nextResults = testFirstAndNext(bi, text);
 116         Vector previousResults = testLastAndPrevious(bi, text);
 117 
 118         logln("comparing forward and backward...");
 119         int errs = getErrorCount();
 120         compareFragmentLists("forward iteration", "backward iteration", nextResults,
 121                         previousResults);
 122         if (getErrorCount() == errs) {
 123             logln("comparing expected and actual...");
 124             compareFragmentLists("expected result", "actual result", expectedResult,
 125                             nextResults);
 126         }
 127 
 128         int[] boundaries = new int[expectedResult.size() + 3];
 129         boundaries[0] = BreakIterator.DONE;
 130         boundaries[1] = 0;
 131         for (int i = 0; i < expectedResult.size(); i++)
 132             boundaries[i + 2] = boundaries[i + 1] + ((String)expectedResult.elementAt(i)).
 133                             length();
 134         boundaries[boundaries.length - 1] = BreakIterator.DONE;
 135 
 136         testFollowing(bi, text, boundaries);
 137         testPreceding(bi, text, boundaries);
 138         testIsBoundary(bi, text, boundaries);
 139 
 140         doMultipleSelectionTest(bi, text);
 141     }
 142 
 143     private Vector testFirstAndNext(BreakIterator bi, String text) {
 144         int p = bi.first();
 145         int lastP = p;
 146         Vector<String> result = new Vector<String>();
 147 
 148         if (p != 0)
 149             errln("first() returned " + p + " instead of 0");
 150         while (p != BreakIterator.DONE) {
 151             p = bi.next();
 152             if (p != BreakIterator.DONE) {
 153                 if (p <= lastP)
 154                     errln("next() failed to move forward: next() on position "
 155                                     + lastP + " yielded " + p);
 156 
 157                 result.addElement(text.substring(lastP, p));
 158             }
 159             else {
 160                 if (lastP != text.length())
 161                     errln("next() returned DONE prematurely: offset was "
 162                                     + lastP + " instead of " + text.length());
 163             }
 164             lastP = p;
 165         }
 166         return result;
 167     }
 168 
 169     private Vector testLastAndPrevious(BreakIterator bi, String text) {
 170         int p = bi.last();
 171         int lastP = p;
 172         Vector<String> result = new Vector<String>();
 173 
 174         if (p != text.length())
 175             errln("last() returned " + p + " instead of " + text.length());
 176         while (p != BreakIterator.DONE) {
 177             p = bi.previous();
 178             if (p != BreakIterator.DONE) {
 179                 if (p >= lastP)
 180                     errln("previous() failed to move backward: previous() on position "
 181                                     + lastP + " yielded " + p);
 182 
 183                 result.insertElementAt(text.substring(p, lastP), 0);
 184             }
 185             else {
 186                 if (lastP != 0)
 187                     errln("previous() returned DONE prematurely: offset was "
 188                                     + lastP + " instead of 0");
 189             }
 190             lastP = p;
 191         }
 192         return result;
 193     }
 194 
 195     private void compareFragmentLists(String f1Name, String f2Name, Vector f1, Vector f2) {
 196         int p1 = 0;
 197         int p2 = 0;
 198         String s1;
 199         String s2;
 200         int t1 = 0;
 201         int t2 = 0;
 202 
 203         while (p1 < f1.size() && p2 < f2.size()) {
 204             s1 = (String)f1.elementAt(p1);
 205             s2 = (String)f2.elementAt(p2);
 206             t1 += s1.length();
 207             t2 += s2.length();
 208 
 209             if (s1.equals(s2)) {
 210                 debugLogln("   >" + s1 + "<");
 211                 ++p1;
 212                 ++p2;
 213             }
 214             else {
 215                 int tempT1 = t1;
 216                 int tempT2 = t2;
 217                 int tempP1 = p1;
 218                 int tempP2 = p2;
 219 
 220                 while (tempT1 != tempT2 && tempP1 < f1.size() && tempP2 < f2.size()) {
 221                     while (tempT1 < tempT2 && tempP1 < f1.size()) {
 222                         tempT1 += ((String)f1.elementAt(tempP1)).length();
 223                         ++tempP1;
 224                     }
 225                     while (tempT2 < tempT1 && tempP2 < f2.size()) {
 226                         tempT2 += ((String)f2.elementAt(tempP2)).length();
 227                         ++tempP2;
 228                     }
 229                 }
 230                 logln("*** " + f1Name + " has:");
 231                 while (p1 <= tempP1 && p1 < f1.size()) {
 232                     s1 = (String)f1.elementAt(p1);
 233                     t1 += s1.length();
 234                     debugLogln(" *** >" + s1 + "<");
 235                     ++p1;
 236                 }
 237                 logln("***** " + f2Name + " has:");
 238                 while (p2 <= tempP2 && p2 < f2.size()) {
 239                     s2 = (String)f2.elementAt(p2);
 240                     t2 += s2.length();
 241                     debugLogln(" ***** >" + s2 + "<");
 242                     ++p2;
 243                 }
 244                 errln("Discrepancy between " + f1Name + " and " + f2Name + "\n---\n" + f1 +"\n---\n" + f2);
 245             }
 246         }
 247     }
 248 
 249     private void testFollowing(BreakIterator bi, String text, int[] boundaries) {
 250         logln("testFollowing():");
 251         int p = 2;
 252         int i = 0;
 253         try {
 254             for (i = 0; i <= text.length(); i++) {  // change to <= when new BI code goes in
 255                 if (i == boundaries[p])
 256                     ++p;
 257 
 258                 int b = bi.following(i);
 259                 logln("bi.following(" + i + ") -> " + b);
 260                 if (b != boundaries[p])
 261                     errln("Wrong result from following() for " + i + ": expected " + boundaries[p]
 262                           + ", got " + b);
 263             }
 264         } catch (IllegalArgumentException illargExp) {
 265             errln("IllegalArgumentException caught from following() for offset: " + i);
 266         }
 267     }
 268 
 269     private void testPreceding(BreakIterator bi, String text, int[] boundaries) {
 270         logln("testPreceding():");
 271         int p = 0;
 272         int i = 0;
 273         try {
 274             for (i = 0; i <= text.length(); i++) {  // change to <= when new BI code goes in
 275                 int b = bi.preceding(i);
 276                 logln("bi.preceding(" + i + ") -> " + b);
 277                 if (b != boundaries[p])
 278                     errln("Wrong result from preceding() for " + i + ": expected " + boundaries[p]
 279                           + ", got " + b);
 280 
 281                 if (i == boundaries[p + 1])
 282                     ++p;
 283             }
 284         } catch (IllegalArgumentException illargExp) {
 285             errln("IllegalArgumentException caught from preceding() for offset: " + i);
 286         }
 287     }
 288 
 289     private void testIsBoundary(BreakIterator bi, String text, int[] boundaries) {
 290         logln("testIsBoundary():");
 291         int p = 1;
 292         boolean isB;
 293         for (int i = 0; i <= text.length(); i++) {  // change to <= when new BI code goes in
 294             isB = bi.isBoundary(i);
 295             logln("bi.isBoundary(" + i + ") -> " + isB);
 296 
 297             if (i == boundaries[p]) {
 298                 if (!isB)
 299                     errln("Wrong result from isBoundary() for " + i + ": expected true, got false");
 300                 ++p;
 301             }
 302             else {
 303                 if (isB)
 304                     errln("Wrong result from isBoundary() for " + i + ": expected false, got true");
 305             }
 306         }
 307     }
 308 
 309     private void doMultipleSelectionTest(BreakIterator iterator, String testText)
 310     {
 311         logln("Multiple selection test...");
 312         BreakIterator testIterator = (BreakIterator)iterator.clone();
 313         int offset = iterator.first();
 314         int testOffset;
 315         int count = 0;
 316 
 317         do {
 318             testOffset = testIterator.first();
 319             testOffset = testIterator.next(count);
 320             logln("next(" + count + ") -> " + testOffset);
 321             if (offset != testOffset)
 322                 errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 323 
 324             if (offset != BreakIterator.DONE) {
 325                 count++;
 326                 offset = iterator.next();
 327             }
 328         } while (offset != BreakIterator.DONE);
 329 
 330         // now do it backwards...
 331         offset = iterator.last();
 332         count = 0;
 333 
 334         do {
 335             testOffset = testIterator.last();
 336             testOffset = testIterator.next(count);
 337             logln("next(" + count + ") -> " + testOffset);
 338             if (offset != testOffset)
 339                 errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 340 
 341             if (offset != BreakIterator.DONE) {
 342                 count--;
 343                 offset = iterator.previous();
 344             }
 345         } while (offset != BreakIterator.DONE);
 346     }
 347 
 348     private void doBreakInvariantTest(BreakIterator tb, String testChars)
 349     {
 350         StringBuffer work = new StringBuffer("aaa");
 351         int errorCount = 0;
 352 
 353         // a break should always occur after CR (unless followed by LF), LF, PS, and LS
 354         String breaks = /*"\r\n\u2029\u2028"*/"\n\u2029\u2028";
 355                             // change this back when new BI code is added
 356 
 357         for (int i = 0; i < breaks.length(); i++) {
 358             work.setCharAt(1, breaks.charAt(i));
 359             for (int j = 0; j < testChars.length(); j++) {
 360                 work.setCharAt(0, testChars.charAt(j));
 361                 for (int k = 0; k < testChars.length(); k++) {
 362                     char c = testChars.charAt(k);
 363 
 364                     // if a cr is followed by lf, don't do the check (they stay together)
 365                     if (work.charAt(1) == '\r' && (c == '\n'))
 366                         continue;
 367 
 368                     // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
 369                     // for breaking purposes as per UTR14
 370                     int type1 = Character.getType(work.charAt(1));
 371                     int type2 = Character.getType(c);
 372                     if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
 373                         type2 == Character.CONTROL || type2 == Character.FORMAT) {
 374                         continue;
 375                     }
 376 
 377                     work.setCharAt(2, c);
 378                     tb.setText(work.toString());
 379                     boolean seen2 = false;
 380                     for (int l = tb.first(); l != BreakIterator.DONE; l = tb.next()) {
 381                         if (l == 2)
 382                             seen2 = true;
 383                     }
 384                     if (!seen2) {
 385                         errln("No break between U+" + Integer.toHexString((int)(work.charAt(1)))
 386                                     + " and U+" + Integer.toHexString((int)(work.charAt(2))));
 387                         errorCount++;
 388                         if (errorCount >= 75)
 389                             return;
 390                     }
 391                 }
 392             }
 393         }
 394     }
 395 
 396     private void doOtherInvariantTest(BreakIterator tb, String testChars)
 397     {
 398         StringBuffer work = new StringBuffer("a\r\na");
 399         int errorCount = 0;
 400 
 401         // a break should never occur between CR and LF
 402         for (int i = 0; i < testChars.length(); i++) {
 403             work.setCharAt(0, testChars.charAt(i));
 404             for (int j = 0; j < testChars.length(); j++) {
 405                 work.setCharAt(3, testChars.charAt(j));
 406                 tb.setText(work.toString());
 407                 for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
 408                     if (k == 2) {
 409                         errln("Break between CR and LF in string U+" + Integer.toHexString(
 410                                 (int)(work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString(
 411                                 (int)(work.charAt(3))));
 412                         errorCount++;
 413                         if (errorCount >= 75)
 414                             return;
 415                     }
 416             }
 417         }
 418 
 419         // a break should never occur before a non-spacing mark, unless it's preceded
 420         // by a line terminator
 421         work.setLength(0);
 422         work.append("aaaa");
 423         for (int i = 0; i < testChars.length(); i++) {
 424             char c = testChars.charAt(i);
 425             if (c == '\n' || c == '\r' || c == '\u2029' || c == '\u2028' || c == '\u0003')
 426                 continue;
 427             work.setCharAt(1, c);
 428             for (int j = 0; j < testChars.length(); j++) {
 429                 c = testChars.charAt(j);
 430                 if (Character.getType(c) != Character.NON_SPACING_MARK && Character.getType(c)
 431                         != Character.ENCLOSING_MARK)
 432                     continue;
 433                 work.setCharAt(2, c);
 434 
 435                 // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
 436                 // for breaking purposes as per UTR14
 437                 int type1 = Character.getType(work.charAt(1));
 438                 int type2 = Character.getType(work.charAt(2));
 439                 if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
 440                     type2 == Character.CONTROL || type2 == Character.FORMAT) {
 441                     continue;
 442                 }
 443 
 444                 tb.setText(work.toString());
 445                 for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
 446                     if (k == 2) {
 447                         errln("Break between U+" + Integer.toHexString((int)(work.charAt(1)))
 448                                 + " and U+" + Integer.toHexString((int)(work.charAt(2))));
 449                         errorCount++;
 450                         if (errorCount >= 75)
 451                             return;
 452                     }
 453             }
 454         }
 455     }
 456 
 457     public void debugLogln(String s) {
 458         final String zeros = "0000";
 459         String temp;
 460         StringBuffer out = new StringBuffer();
 461         for (int i = 0; i < s.length(); i++) {
 462             char c = s.charAt(i);
 463             if (c >= ' ' && c < '\u007f')
 464                 out.append(c);
 465             else {
 466                 out.append("\\u");
 467                 temp = Integer.toHexString((int)c);
 468                 out.append(zeros.substring(0, 4 - temp.length()));
 469                 out.append(temp);
 470             }
 471         }
 472         logln(out.toString());
 473     }
 474 
 475     //=========================================================================
 476     // tests
 477     //=========================================================================
 478 
 479     public void TestWordBreak() {
 480 
 481         Vector<String> wordSelectionData = new Vector<String>();
 482 
 483         wordSelectionData.addElement("12,34");
 484 
 485         wordSelectionData.addElement(" ");
 486         wordSelectionData.addElement("\u00A2"); //cent sign
 487         wordSelectionData.addElement("\u00A3"); //pound sign
 488         wordSelectionData.addElement("\u00A4"); //currency sign
 489         wordSelectionData.addElement("\u00A5"); //yen sign
 490         wordSelectionData.addElement("alpha-beta-gamma");
 491         wordSelectionData.addElement(".");
 492         wordSelectionData.addElement(" ");
 493         wordSelectionData.addElement("Badges");
 494         wordSelectionData.addElement("?");
 495         wordSelectionData.addElement(" ");
 496         wordSelectionData.addElement("BADGES");
 497         wordSelectionData.addElement("!");
 498         wordSelectionData.addElement("?");
 499         wordSelectionData.addElement("!");
 500         wordSelectionData.addElement(" ");
 501         wordSelectionData.addElement("We");
 502         wordSelectionData.addElement(" ");
 503         wordSelectionData.addElement("don't");
 504         wordSelectionData.addElement(" ");
 505         wordSelectionData.addElement("need");
 506         wordSelectionData.addElement(" ");
 507         wordSelectionData.addElement("no");
 508         wordSelectionData.addElement(" ");
 509         wordSelectionData.addElement("STINKING");
 510         wordSelectionData.addElement(" ");
 511         wordSelectionData.addElement("BADGES");
 512         wordSelectionData.addElement("!");
 513         wordSelectionData.addElement("!");
 514         wordSelectionData.addElement("!");
 515 
 516         wordSelectionData.addElement("012.566,5");
 517         wordSelectionData.addElement(" ");
 518         wordSelectionData.addElement("123.3434,900");
 519         wordSelectionData.addElement(" ");
 520         wordSelectionData.addElement("1000,233,456.000");
 521         wordSelectionData.addElement(" ");
 522         wordSelectionData.addElement("1,23.322%");
 523         wordSelectionData.addElement(" ");
 524         wordSelectionData.addElement("123.1222");
 525 
 526         wordSelectionData.addElement(" ");
 527         wordSelectionData.addElement("\u0024123,000.20");
 528 
 529         wordSelectionData.addElement(" ");
 530         wordSelectionData.addElement("179.01\u0025");
 531 
 532         wordSelectionData.addElement("Hello");
 533         wordSelectionData.addElement(",");
 534         wordSelectionData.addElement(" ");
 535         wordSelectionData.addElement("how");
 536         wordSelectionData.addElement(" ");
 537         wordSelectionData.addElement("are");
 538         wordSelectionData.addElement(" ");
 539         wordSelectionData.addElement("you");
 540         wordSelectionData.addElement(" ");
 541         wordSelectionData.addElement("X");
 542         wordSelectionData.addElement(" ");
 543 
 544         wordSelectionData.addElement("Now");
 545         wordSelectionData.addElement("\r");
 546         wordSelectionData.addElement("is");
 547         wordSelectionData.addElement("\n");
 548         wordSelectionData.addElement("the");
 549         wordSelectionData.addElement("\r\n");
 550         wordSelectionData.addElement("time");
 551         wordSelectionData.addElement("\n");
 552         wordSelectionData.addElement("\r");
 553         wordSelectionData.addElement("for");
 554         wordSelectionData.addElement("\r");
 555         wordSelectionData.addElement("\r");
 556         wordSelectionData.addElement("all");
 557         wordSelectionData.addElement(" ");
 558 
 559         generalIteratorTest(wordBreak, wordSelectionData);
 560     }
 561 
 562     public void TestBug4097779() {
 563         Vector<String> wordSelectionData = new Vector<String>();
 564 
 565         wordSelectionData.addElement("aa\u0300a");
 566         wordSelectionData.addElement(" ");
 567 
 568         generalIteratorTest(wordBreak, wordSelectionData);
 569     }
 570 
 571     public void TestBug4098467Words() {
 572         Vector<String> wordSelectionData = new Vector<String>();
 573 
 574         // What follows is a string of Korean characters (I found it in the Yellow Pages
 575         // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
 576         // it correctly), first as precomposed syllables, and then as conjoining jamo.
 577         // Both sequences should be semantically identical and break the same way.
 578         // precomposed syllables...
 579         wordSelectionData.addElement("\uc0c1\ud56d");
 580         wordSelectionData.addElement(" ");
 581         wordSelectionData.addElement("\ud55c\uc778");
 582         wordSelectionData.addElement(" ");
 583         wordSelectionData.addElement("\uc5f0\ud569");
 584         wordSelectionData.addElement(" ");
 585         wordSelectionData.addElement("\uc7a5\ub85c\uad50\ud68c");
 586         wordSelectionData.addElement(" ");
 587         // conjoining jamo...
 588         wordSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc");
 589         wordSelectionData.addElement(" ");
 590         wordSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab");
 591         wordSelectionData.addElement(" ");
 592         wordSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8");
 593         wordSelectionData.addElement(" ");
 594         wordSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
 595         wordSelectionData.addElement(" ");
 596 
 597         generalIteratorTest(wordBreak, wordSelectionData);
 598     }
 599 
 600     public void TestBug4117554Words() {
 601         Vector<String> wordSelectionData = new Vector<String>();
 602 
 603         // this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
 604         // count as a Kanji character for the purposes of word breaking
 605         wordSelectionData.addElement("abc");
 606         wordSelectionData.addElement("\u4e01\u4e02\u3005\u4e03\u4e03");
 607         wordSelectionData.addElement("abc");
 608 
 609         generalIteratorTest(wordBreak, wordSelectionData);
 610     }
 611 
 612     public void TestSentenceBreak() {
 613         Vector<String> sentenceSelectionData = new Vector<String>();
 614 
 615         sentenceSelectionData.addElement("This is a simple sample sentence. ");
 616         sentenceSelectionData.addElement("(This is it.) ");
 617         sentenceSelectionData.addElement("This is a simple sample sentence. ");
 618         sentenceSelectionData.addElement("\"This isn\'t it.\" ");
 619         sentenceSelectionData.addElement("Hi! ");
 620         sentenceSelectionData.addElement("This is a simple sample sentence. ");
 621         sentenceSelectionData.addElement("It does not have to make any sense as you can see. ");
 622         sentenceSelectionData.addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ");
 623         sentenceSelectionData.addElement("Che la dritta via aveo smarrita. ");
 624         sentenceSelectionData.addElement("He said, that I said, that you said!! ");
 625 
 626         sentenceSelectionData.addElement("Don't rock the boat.\u2029");
 627 
 628         sentenceSelectionData.addElement("Because I am the daddy, that is why. ");
 629         sentenceSelectionData.addElement("Not on my time (el timo.)! ");
 630 
 631         sentenceSelectionData.addElement("So what!!\u2029");
 632 
 633         sentenceSelectionData.addElement("\"But now,\" he said, \"I know!\" ");
 634         sentenceSelectionData.addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ");
 635         sentenceSelectionData.addElement("One species, B. anthracis, is highly virulent.\n");
 636         sentenceSelectionData.addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" ");
 637         sentenceSelectionData.addElement("Have you ever said, \"This is where \tI shall live\"? ");
 638         sentenceSelectionData.addElement("He answered, \"You may not!\" ");
 639         sentenceSelectionData.addElement("Another popular saying is: \"How do you do?\". ");
 640         sentenceSelectionData.addElement("Yet another popular saying is: \'I\'m fine thanks.\' ");
 641         sentenceSelectionData.addElement("What is the proper use of the abbreviation pp.? ");
 642         sentenceSelectionData.addElement("Yes, I am definatelly 12\" tall!!");
 643 
 644         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 645     }
 646 
 647     public void TestBug4113835() {
 648         Vector<String> sentenceSelectionData = new Vector<String>();
 649 
 650         // test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
 651         sentenceSelectionData.addElement("Now\ris\nthe\r\ntime\n\rfor\r\rall\u2029");
 652 
 653         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 654     }
 655 
 656     public void TestBug4111338() {
 657         Vector<String> sentenceSelectionData = new Vector<String>();
 658 
 659         // test for bug #4111338: Don't break sentences at the boundary between CJK
 660         // and other letters
 661         sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:\"JAVA\u821c"
 662                 + "\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba"
 663                 + "\u611d\u57b6\u2510\u5d46\".\u2029");
 664         sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
 665                 + "\u97e4JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
 666                 + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
 667         sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4"
 668                 + "\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8"
 669                 + "\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
 670         sentenceSelectionData.addElement("He said, \"I can go there.\"\u2029");
 671 
 672         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 673     }
 674 
 675     public void TestBug4117554Sentences() {
 676         Vector<String> sentenceSelectionData = new Vector<String>();
 677 
 678         // Treat fullwidth variants of .!? the same as their
 679         // normal counterparts
 680         sentenceSelectionData.addElement("I know I'm right\uff0e ");
 681         sentenceSelectionData.addElement("Right\uff1f ");
 682         sentenceSelectionData.addElement("Right\uff01 ");
 683 
 684         // Don't break sentences at boundary between CJK and digits
 685         sentenceSelectionData.addElement("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
 686                 + "\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
 687                 + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
 688 
 689         // Break sentence between a sentence terminator and
 690         // opening punctuation
 691         sentenceSelectionData.addElement("no?");
 692         sentenceSelectionData.addElement("(yes)");
 693 
 694         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 695     }
 696 
 697     public void TestBug4158381() {
 698         Vector<String> sentenceSelectionData = new Vector<String>();
 699 
 700         // Don't break sentence after period if it isn't followed by a space
 701         sentenceSelectionData.addElement("Test <code>Flags.Flag</code> class.  ");
 702         sentenceSelectionData.addElement("Another test.\u2029");
 703 
 704         // No breaks when there are no terminators around
 705         sentenceSelectionData.addElement("<P>Provides a set of "
 706                 + "&quot;lightweight&quot; (all-java<FONT SIZE=\"-2\"><SUP>TM"
 707                 + "</SUP></FONT> language) components that, "
 708                 + "to the maximum degree possible, work the same on all platforms.  ");
 709         sentenceSelectionData.addElement("Another test.\u2029");
 710 
 711         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 712     }
 713 
 714     public void TestBug4143071() {
 715         Vector<String> sentenceSelectionData = new Vector<String>();
 716 
 717         // Make sure sentences that end with digits work right
 718         sentenceSelectionData.addElement("Today is the 27th of May, 1998.  ");
 719         sentenceSelectionData.addElement("Tomorrow with be 28 May 1998.  ");
 720         sentenceSelectionData.addElement("The day after will be the 30th.\u2029");
 721 
 722         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 723     }
 724 
 725     public void TestBug4152416() {
 726         Vector<String> sentenceSelectionData = new Vector<String>();
 727 
 728         // Make sure sentences ending with a capital letter are treated correctly
 729         sentenceSelectionData.addElement("The type of all primitive "
 730                 + "<code>boolean</code> values accessed in the target VM.  ");
 731         sentenceSelectionData.addElement("Calls to xxx will return an "
 732                 + "implementor of this interface.\u2029");
 733 
 734         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 735     }
 736 
 737     public void TestBug4152117() {
 738         Vector<String> sentenceSelectionData = new Vector<String>();
 739 
 740         // Make sure sentence breaking is handling punctuation correctly
 741         // [COULD NOT REPRODUCE THIS BUG, BUT TEST IS HERE TO MAKE SURE
 742         // IT DOESN'T CROP UP]
 743         sentenceSelectionData.addElement("Constructs a randomly generated "
 744                 + "BigInteger, uniformly distributed over the range <tt>0</tt> "
 745                 + "to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive.  ");
 746         sentenceSelectionData.addElement("The uniformity of the distribution "
 747                 + "assumes that a fair source of random bits is provided in "
 748                 + "<tt>rnd</tt>.  ");
 749         sentenceSelectionData.addElement("Note that this constructor always "
 750                 + "constructs a non-negative BigInteger.\u2029");
 751 
 752         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 753     }
 754 
 755     public void TestBug8264765() {
 756         Vector<String> sentenceSelectionData = new Vector<String>();
 757 
 758         // Comma should not be regarded as the start of a sentence,
 759         // otherwise the backwards rule would break the following sentence.
 760         sentenceSelectionData.addElement(
 761             "Due to a problem (e.g., software bug), the server is down. ");
 762 
 763         generalIteratorTest(sentenceBreak, sentenceSelectionData);
 764     }
 765 
 766     public void TestLineBreak() {
 767         Vector<String> lineSelectionData = new Vector<String>();
 768 
 769         lineSelectionData.addElement("Multi-");
 770         lineSelectionData.addElement("Level ");
 771         lineSelectionData.addElement("example ");
 772         lineSelectionData.addElement("of ");
 773         lineSelectionData.addElement("a ");
 774         lineSelectionData.addElement("semi-");
 775         lineSelectionData.addElement("idiotic ");
 776         lineSelectionData.addElement("non-");
 777         lineSelectionData.addElement("sensical ");
 778         lineSelectionData.addElement("(non-");
 779         lineSelectionData.addElement("important) ");
 780         lineSelectionData.addElement("sentence. ");
 781 
 782         lineSelectionData.addElement("Hi  ");
 783         lineSelectionData.addElement("Hello ");
 784         lineSelectionData.addElement("How\n");
 785         lineSelectionData.addElement("are\r");
 786         lineSelectionData.addElement("you\u2028");
 787         lineSelectionData.addElement("fine.\t");
 788         lineSelectionData.addElement("good.  ");
 789 
 790         lineSelectionData.addElement("Now\r");
 791         lineSelectionData.addElement("is\n");
 792         lineSelectionData.addElement("the\r\n");
 793         lineSelectionData.addElement("time\n");
 794         lineSelectionData.addElement("\r");
 795         lineSelectionData.addElement("for\r");
 796         lineSelectionData.addElement("\r");
 797         lineSelectionData.addElement("all");
 798 
 799         generalIteratorTest(lineBreak, lineSelectionData);
 800     }
 801 
 802     public void TestBug4068133() {
 803         Vector<String> lineSelectionData = new Vector<String>();
 804 
 805         lineSelectionData.addElement("\u96f6");
 806         lineSelectionData.addElement("\u4e00\u3002");
 807         lineSelectionData.addElement("\u4e8c\u3001");
 808         lineSelectionData.addElement("\u4e09\u3002\u3001");
 809         lineSelectionData.addElement("\u56db\u3001\u3002\u3001");
 810         lineSelectionData.addElement("\u4e94,");
 811         lineSelectionData.addElement("\u516d.");
 812         lineSelectionData.addElement("\u4e03.\u3001,\u3002");
 813         lineSelectionData.addElement("\u516b");
 814 
 815         generalIteratorTest(lineBreak, lineSelectionData);
 816     }
 817 
 818     public void TestBug4086052() {
 819         Vector<String> lineSelectionData = new Vector<String>();
 820 
 821         lineSelectionData.addElement("foo\u00a0bar ");
 822 //        lineSelectionData.addElement("foo\ufeffbar");
 823 
 824         generalIteratorTest(lineBreak, lineSelectionData);
 825     }
 826 
 827     public void TestBug4097920() {
 828         Vector<String> lineSelectionData = new Vector<String>();
 829 
 830         lineSelectionData.addElement("dog,");
 831         lineSelectionData.addElement("cat,");
 832         lineSelectionData.addElement("mouse ");
 833         lineSelectionData.addElement("(one)");
 834         lineSelectionData.addElement("(two)\n");
 835 
 836         generalIteratorTest(lineBreak, lineSelectionData);
 837     }
 838     /*
 839     public void TestBug4035266() {
 840         Vector<String> lineSelectionData = new Vector<String>();
 841 
 842         lineSelectionData.addElement("The ");
 843         lineSelectionData.addElement("balance ");
 844         lineSelectionData.addElement("is ");
 845         lineSelectionData.addElement("$-23,456.78, ");
 846         lineSelectionData.addElement("not ");
 847         lineSelectionData.addElement("-$32,456.78!\n");
 848 
 849         generalIteratorTest(lineBreak, lineSelectionData);
 850     }
 851     */
 852     public void TestBug4098467Lines() {
 853         Vector<String> lineSelectionData = new Vector<String>();
 854 
 855         // What follows is a string of Korean characters (I found it in the Yellow Pages
 856         // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
 857         // it correctly), first as precomposed syllables, and then as conjoining jamo.
 858         // Both sequences should be semantically identical and break the same way.
 859         // precomposed syllables...
 860         lineSelectionData.addElement("\uc0c1");
 861         lineSelectionData.addElement("\ud56d ");
 862         lineSelectionData.addElement("\ud55c");
 863         lineSelectionData.addElement("\uc778 ");
 864         lineSelectionData.addElement("\uc5f0");
 865         lineSelectionData.addElement("\ud569 ");
 866         lineSelectionData.addElement("\uc7a5");
 867         lineSelectionData.addElement("\ub85c");
 868         lineSelectionData.addElement("\uad50");
 869         lineSelectionData.addElement("\ud68c ");
 870         // conjoining jamo...
 871         lineSelectionData.addElement("\u1109\u1161\u11bc\u1112\u1161\u11bc ");
 872         lineSelectionData.addElement("\u1112\u1161\u11ab\u110b\u1175\u11ab ");
 873         lineSelectionData.addElement("\u110b\u1167\u11ab\u1112\u1161\u11b8 ");
 874         lineSelectionData.addElement("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
 875 
 876         if (Locale.getDefault().getLanguage().equals("th")) {
 877             logln("This test is skipped in th locale.");
 878             return;
 879         }
 880 
 881         generalIteratorTest(lineBreak, lineSelectionData);
 882     }
 883 
 884     public void TestBug4117554Lines() {
 885         Vector<String> lineSelectionData = new Vector<String>();
 886 
 887         // Fullwidth .!? should be treated as postJwrd
 888         lineSelectionData.addElement("\u4e01\uff0e");
 889         lineSelectionData.addElement("\u4e02\uff01");
 890         lineSelectionData.addElement("\u4e03\uff1f");
 891 
 892         generalIteratorTest(lineBreak, lineSelectionData);
 893     }
 894 
 895     public void TestBug4217703() {
 896         if (Locale.getDefault().getLanguage().equals("th")) {
 897             logln("This test is skipped in th locale.");
 898             return;
 899         }
 900 
 901         Vector<String> lineSelectionData = new Vector<String>();
 902 
 903         // There shouldn't be a line break between sentence-ending punctuation
 904         // and a closing quote
 905         lineSelectionData.addElement("He ");
 906         lineSelectionData.addElement("said ");
 907         lineSelectionData.addElement("\"Go!\"  ");
 908         lineSelectionData.addElement("I ");
 909         lineSelectionData.addElement("went.  ");
 910 
 911         lineSelectionData.addElement("Hashtable$Enumeration ");
 912         lineSelectionData.addElement("getText().");
 913         lineSelectionData.addElement("getIndex()");
 914 
 915         generalIteratorTest(lineBreak, lineSelectionData);
 916     }
 917 
    // Base letters followed by a single combining mark. TestCharacterBreak adds
    // each of these as one element of its character-break test data.
    // 'S' + U+0300 (combining grave accent)
    private static final String graveS = "S\u0300";
    // 'i' + U+0317 (combining acute accent below)
    private static final String acuteBelowI = "i\u0317";
    // 'e' + U+0301 (combining acute accent)
    private static final String acuteE = "e\u0301";
    // 'a' + U+0302 (combining circumflex accent)
    private static final String circumflexA = "a\u0302";
    // 'e' + U+0303 (combining tilde)
    private static final String tildeE = "e\u0303";
 923 
 924     public void TestCharacterBreak() {
 925         Vector<String> characterSelectionData = new Vector<String>();
 926 
 927         characterSelectionData.addElement(graveS);
 928         characterSelectionData.addElement(acuteBelowI);
 929         characterSelectionData.addElement("m");
 930         characterSelectionData.addElement("p");
 931         characterSelectionData.addElement("l");
 932         characterSelectionData.addElement(acuteE);
 933         characterSelectionData.addElement(" ");
 934         characterSelectionData.addElement("s");
 935         characterSelectionData.addElement(circumflexA);
 936         characterSelectionData.addElement("m");
 937         characterSelectionData.addElement("p");
 938         characterSelectionData.addElement("l");
 939         characterSelectionData.addElement(tildeE);
 940         characterSelectionData.addElement(".");
 941         characterSelectionData.addElement("w");
 942         characterSelectionData.addElement(circumflexA);
 943         characterSelectionData.addElement("w");
 944         characterSelectionData.addElement("a");
 945         characterSelectionData.addElement("f");
 946         characterSelectionData.addElement("q");
 947         characterSelectionData.addElement("\n");
 948         characterSelectionData.addElement("\r");
 949         characterSelectionData.addElement("\r\n");
 950         characterSelectionData.addElement("\n");
 951 
 952         generalIteratorTest(characterBreak, characterSelectionData);
 953     }
 954 
 955     public void TestBug4098467Characters() {
 956         Vector<String> characterSelectionData = new Vector<String>();
 957 
 958         // What follows is a string of Korean characters (I found it in the Yellow Pages
 959         // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
 960         // it correctly), first as precomposed syllables, and then as conjoining jamo.
 961         // Both sequences should be semantically identical and break the same way.
 962         // precomposed syllables...
 963         characterSelectionData.addElement("\uc0c1");
 964         characterSelectionData.addElement("\ud56d");
 965         characterSelectionData.addElement(" ");
 966         characterSelectionData.addElement("\ud55c");
 967         characterSelectionData.addElement("\uc778");
 968         characterSelectionData.addElement(" ");
 969         characterSelectionData.addElement("\uc5f0");
 970         characterSelectionData.addElement("\ud569");
 971         characterSelectionData.addElement(" ");
 972         characterSelectionData.addElement("\uc7a5");
 973         characterSelectionData.addElement("\ub85c");
 974         characterSelectionData.addElement("\uad50");
 975         characterSelectionData.addElement("\ud68c");
 976         characterSelectionData.addElement(" ");
 977         // conjoining jamo...
 978         characterSelectionData.addElement("\u1109\u1161\u11bc");
 979         characterSelectionData.addElement("\u1112\u1161\u11bc");
 980         characterSelectionData.addElement(" ");
 981         characterSelectionData.addElement("\u1112\u1161\u11ab");
 982         characterSelectionData.addElement("\u110b\u1175\u11ab");
 983         characterSelectionData.addElement(" ");
 984         characterSelectionData.addElement("\u110b\u1167\u11ab");
 985         characterSelectionData.addElement("\u1112\u1161\u11b8");
 986         characterSelectionData.addElement(" ");
 987         characterSelectionData.addElement("\u110c\u1161\u11bc");
 988         characterSelectionData.addElement("\u1105\u1169");
 989         characterSelectionData.addElement("\u1100\u116d");
 990         characterSelectionData.addElement("\u1112\u116c");
 991 
 992         generalIteratorTest(characterBreak, characterSelectionData);
 993     }
 994 
 995     public void TestBug4153072() {
 996         BreakIterator iter = BreakIterator.getWordInstance();
 997         String str = "...Hello, World!...";
 998         int begin = 3;
 999         int end = str.length() - 3;
1000         boolean gotException = false;
1001         boolean dummy;
1002 
1003         iter.setText(new StringCharacterIterator(str, begin, end, begin));
1004         for (int index = -1; index < begin + 1; ++index) {
1005             try {
1006                 dummy = iter.isBoundary(index);
1007                 if (index < begin)
1008                     errln("Didn't get exception with offset = " + index +
1009                                     " and begin index = " + begin);
1010             }
1011             catch (IllegalArgumentException e) {
1012                 if (index >= begin)
1013                     errln("Got exception with offset = " + index +
1014                                     " and begin index = " + begin);
1015             }
1016         }
1017     }
1018 
1019     public void TestBug4146175Sentences() {
1020         Vector<String> sentenceSelectionData = new Vector<String>();
1021 
1022         // break between periods and opening punctuation even when there's no
1023         // intervening space
1024         sentenceSelectionData.addElement("end.");
1025         sentenceSelectionData.addElement("(This is\u2029");
1026 
1027         // treat the fullwidth period as an unambiguous sentence terminator
1028         sentenceSelectionData.addElement("\u7d42\u308f\u308a\uff0e");
1029         sentenceSelectionData.addElement("\u300c\u3053\u308c\u306f");
1030 
1031         generalIteratorTest(sentenceBreak, sentenceSelectionData);
1032     }
1033 
1034     public void TestBug4146175Lines() {
1035         if (Locale.getDefault().getLanguage().equals("th")) {
1036             logln("This test is skipped in th locale.");
1037             return;
1038         }
1039 
1040         Vector<String> lineSelectionData = new Vector<String>();
1041 
1042         // the fullwidth comma should stick to the preceding Japanese character
1043         lineSelectionData.addElement("\u7d42\uff0c");
1044         lineSelectionData.addElement("\u308f");
1045 
1046         generalIteratorTest(lineBreak, lineSelectionData);
1047     }
1048 
1049     public void TestBug4214367() {
1050         if (Locale.getDefault().getLanguage().equals("th")) {
1051             logln("This test is skipped in th locale.");
1052             return;
1053         }
1054 
1055         Vector<String> wordSelectionData = new Vector<String>();
1056 
1057         // the hiragana and katakana iteration marks and the long vowel mark
1058         // are not being treated correctly by the word-break iterator
1059         wordSelectionData.addElement("\u3042\u3044\u309d\u3042\u309e\u3042\u30fc\u3042");
1060         wordSelectionData.addElement("\u30a2\u30a4\u30fd\u30a2\u30fe\u30a2\u30fc\u30a2");
1061 
1062         generalIteratorTest(wordBreak, wordSelectionData);
1063     }
1064 
    // Shared pool of sample characters for the Test*Invariants methods, each of
    // which appends its own iterator-specific characters before use.
    // The leading control characters U+0000..U+0004 are commented out of the literal.
    private static final String cannedTestChars // characters of the class Cc are ignorable for breaking
        = /*"\u0000\u0001\u0002\u0003\u0004*/" !\"#$%&()+-01234<=>ABCDE[]^_`abcde{}|\u00a0\u00a2"
        + "\u00a3\u00a4\u00a5\u00a6\u00a7\u00a8\u00a9\u00ab\u00ad\u00ae\u00af\u00b0\u00b2\u00b3"
        + "\u00b4\u00b9\u00bb\u00bc\u00bd\u02b0\u02b1\u02b2\u02b3\u02b4\u0300\u0301\u0302\u0303"
        + "\u0304\u05d0\u05d1\u05d2\u05d3\u05d4\u0903\u093e\u093f\u0940\u0949\u0f3a\u0f3b\u2000"
        + "\u2001\u2002\u200c\u200d\u200e\u200f\u2010\u2011\u2012\u2028\u2029\u202a\u203e\u203f"
        + "\u2040\u20dd\u20de\u20df\u20e0\u2160\u2161\u2162\u2163\u2164";
1072 
1073     public void TestSentenceInvariants()
1074     {
1075         BreakIterator e = BreakIterator.getSentenceInstance();
1076         doOtherInvariantTest(e, cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff");
1077     }
1078 
1079     public void TestWordInvariants()
1080     {
1081         if (Locale.getDefault().getLanguage().equals("th")) {
1082             logln("This test is skipped in th locale.");
1083             return;
1084         }
1085 
1086         BreakIterator e = BreakIterator.getWordInstance();
1087         doBreakInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
1088             + "\u30a3\u4e00\u4e01\u4e02");
1089         doOtherInvariantTest(e, cannedTestChars + "\',.\u3041\u3042\u3043\u309b\u309c\u30a1\u30a2"
1090             + "\u30a3\u4e00\u4e01\u4e02");
1091     }
1092 
1093     public void TestLineInvariants()
1094     {
1095         if (Locale.getDefault().getLanguage().equals("th")) {
1096             logln("This test is skipped in th locale.");
1097             return;
1098         }
1099 
1100         BreakIterator e = BreakIterator.getLineInstance();
1101         String testChars = cannedTestChars + ".,;:\u3001\u3002\u3041\u3042\u3043\u3044\u3045"
1102             + "\u30a3\u4e00\u4e01\u4e02";
1103         doBreakInvariantTest(e, testChars);
1104         doOtherInvariantTest(e, testChars);
1105 
1106         int errorCount = 0;
1107 
1108         // in addition to the other invariants, a line-break iterator should make sure that:
1109         // it doesn't break around the non-breaking characters
1110         String noBreak = "\u00a0\u2007\u2011\ufeff";
1111         StringBuffer work = new StringBuffer("aaa");
1112         for (int i = 0; i < testChars.length(); i++) {
1113             char c = testChars.charAt(i);
1114             if (c == '\r' || c == '\n' || c == '\u2029' || c == '\u2028' || c == '\u0003')
1115                 continue;
1116             work.setCharAt(0, c);
1117             for (int j = 0; j < noBreak.length(); j++) {
1118                 work.setCharAt(1, noBreak.charAt(j));
1119                 for (int k = 0; k < testChars.length(); k++) {
1120                     work.setCharAt(2, testChars.charAt(k));
1121                     // CONTROL (Cc) and FORMAT (Cf) Characters are to be ignored
1122                     // for breaking purposes as per UTR14
1123                     int type1 = Character.getType(work.charAt(1));
1124                     int type2 = Character.getType(work.charAt(2));
1125                     if (type1 == Character.CONTROL || type1 == Character.FORMAT ||
1126                         type2 == Character.CONTROL || type2 == Character.FORMAT) {
1127                         continue;
1128                     }
1129                     e.setText(work.toString());
1130                     for (int l = e.first(); l != BreakIterator.DONE; l = e.next()) {
1131                         if (l == 1 || l == 2) {
1132                             //errln("Got break between U+" + Integer.toHexString((int)
1133                             //        (work.charAt(l - 1))) + " and U+" + Integer.toHexString(
1134                             //        (int)(work.charAt(l))) + "\ntype1 = " + type1 + "\ntype2 = " + type2);
1135                             // as per UTR14 spaces followed by a GLUE character should allow
1136                             // line breaking
1137                             if (work.charAt(l-1) == '\u0020' && (work.charAt(l) == '\u00a0' ||
1138                                                                  work.charAt(l) == '\u0f0c' ||
1139                                                                  work.charAt(l) == '\u2007' ||
1140                                                                  work.charAt(l) == '\u2011' ||
1141                                                                  work.charAt(l) == '\u202f' ||
1142                                                                  work.charAt(l) == '\ufeff')) {
1143                                 continue;
1144                             }
1145                             errln("Got break between U+" + Integer.toHexString((int)
1146                                     (work.charAt(l - 1))) + " and U+" + Integer.toHexString(
1147                                     (int)(work.charAt(l))));
1148                             errorCount++;
1149                             if (errorCount >= 75)
1150                                 return;
1151                         }
1152                     }
1153                 }
1154             }
1155         }
1156 
1157         // The following test has so many exceptions that it would be better to write a new set of data
1158         // that tested exactly what should be tested
1159         // Until that point it will be commented out
1160         /*
1161 
1162         // it does break after dashes (unless they're followed by a digit, a non-spacing mark,
1163         // a currency symbol, a space, a format-control character, a regular control character,
1164         // a line or paragraph separator, or another dash)
1165         String dashes = "-\u00ad\u2010\u2012\u2013\u2014";
1166         for (int i = 0; i < testChars.length(); i++) {
1167             work.setCharAt(0, testChars.charAt(i));
1168             for (int j = 0; j < dashes.length(); j++) {
1169                 work.setCharAt(1, dashes.charAt(j));
1170                 for (int k = 0; k < testChars.length(); k++) {
1171                     char c = testChars.charAt(k);
1172                     if (Character.getType(c) == Character.DECIMAL_DIGIT_NUMBER ||
1173                         Character.getType(c) == Character.OTHER_NUMBER ||
1174                         Character.getType(c) == Character.NON_SPACING_MARK ||
1175                         Character.getType(c) == Character.ENCLOSING_MARK ||
1176                         Character.getType(c) == Character.CURRENCY_SYMBOL ||
1177                         Character.getType(c) == Character.DASH_PUNCTUATION ||
1178                         Character.getType(c) == Character.SPACE_SEPARATOR ||
1179                         Character.getType(c) == Character.FORMAT ||
1180                         Character.getType(c) == Character.CONTROL ||
1181                         Character.getType(c) == Character.END_PUNCTUATION ||
1182                         Character.getType(c) == Character.FINAL_QUOTE_PUNCTUATION ||
1183                         Character.getType(c) == Character.OTHER_PUNCTUATION ||
1184                         c == '\'' || c == '\"' ||
1185                         // category EX as per UTR14
1186                         c == '!' || c == '?' || c == '\ufe56' || c == '\ufe57' || c == '\uff01' || c == '\uff1f' ||
1187                         c == '\n' || c == '\r' || c == '\u2028' || c == '\u2029' ||
1188                         c == '\u0003' || c == '\u2007' || c == '\u2011' ||
1189                         c == '\ufeff')
1190                         continue;
1191                     work.setCharAt(2, c);
1192                     e.setText(work.toString());
1193                     boolean saw2 = false;
1194                     for (int l = e.first(); l != BreakIterator.DONE; l = e.next())
1195                         if (l == 2)
1196                             saw2 = true;
1197                     if (!saw2) {
1198                         errln("Didn't get break between U+" + Integer.toHexString((int)
1199                                     (work.charAt(1))) + " and U+" + Integer.toHexString(
1200                                     (int)(work.charAt(2))));
1201                         errorCount++;
1202                         if (errorCount >= 75)
1203                             return;
1204                     }
1205                 }
1206             }
1207         }
1208         */
1209     }
1210 
1211     public void TestCharacterInvariants()
1212     {
1213         BreakIterator e = BreakIterator.getCharacterInstance();
1214         doBreakInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
1215             + "\u11a9\u11aa");
1216         doOtherInvariantTest(e, cannedTestChars + "\u1100\u1101\u1102\u1160\u1161\u1162\u11a8"
1217             + "\u11a9\u11aa");
1218     }
1219 
1220     public void TestEmptyString()
1221     {
1222         String text = "";
1223         Vector<String> x = new Vector<String>();
1224         x.addElement(text);
1225 
1226         generalIteratorTest(lineBreak, x);
1227     }
1228 
1229     public void TestGetAvailableLocales()
1230     {
1231         Locale[] locList = BreakIterator.getAvailableLocales();
1232 
1233         if (locList.length == 0)
1234             errln("getAvailableLocales() returned an empty list!");
1235         // I have no idea how to test this function...
1236     }
1237 
1238 
1239     /**
1240      * Bug 4095322
1241      */
1242     public void TestJapaneseLineBreak()
1243     {
1244         StringBuffer testString = new StringBuffer("\u4e00x\u4e8c");
1245         // Breaking on <Kanji>$<Kanji> is inconsistent
1246 
1247         /* Characters in precedingChars and followingChars have been updated
1248          * from Unicode 2.0.14-based to 3.0.0-based when 4638433 was fixed.
1249          * In concrete terms,
1250          *   0x301F : Its category was changed from Ps to Pe since Unicode 2.1.
1251          *   0x169B & 0x169C : added since Unicode 3.0.0.
1252          */
1253         String precedingChars =
1254             /* Puctuation, Open */
1255           "([{\u201a\u201e\u2045\u207d\u208d\u2329\u3008\u300a\u300c\u300e\u3010\u3014\u3016\u3018\u301a\u301d\ufe35\ufe37\ufe39\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe59\ufe5b\ufe5d\uff08\uff3b\uff5b\uff62\u169b"
1256             /* Punctuation, Initial quote */
1257           + "\u00ab\u2018\u201b\u201c\u201f\u2039"
1258             /* Symbol, Currency */
1259           + "\u00a5\u00a3\u00a4\u20a0";
1260 
1261         String followingChars =
1262             /* Puctuation, Close */
1263           ")]}\u2046\u207e\u208e\u232a\u3009\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b\u301e\u301f\ufd3e\ufe36\ufe38\ufe3a\ufe3c\ufe3e\ufe40\ufe42\ufe44\ufe5a\ufe5c\ufe5e\uff09\uff3d\uff5d\uff63\u169c"
1264             /* Punctuation, Final quote */
1265           + "\u00bb\u2019\u201d\u203a"
1266             /* Punctuation, Other */
1267           + "!%,.:;\u3001\u3002\u2030\u2031\u2032\u2033\u2034"
1268             /* Punctuation, Dash */
1269           + "\u2103\u2109"
1270             /* Symbol, Currency */
1271           + "\u00a2"
1272             /* Letter, Modifier */
1273           + "\u3005\u309d\u309e"
1274             /* Letter, Other */
1275           + "\u3063\u3083\u3085\u3087\u30c3\u30e3\u30e5\u30e7\u30fc\u30fd\u30fe"
1276            /* Mark, Non-Spacing */
1277           + "\u0300\u0301\u0302"
1278             /* Symbol, Modifier */
1279           + "\u309b\u309c"
1280             /* Symbol, Other */
1281           + "\u00b0";
1282 
1283         BreakIterator iter = BreakIterator.getLineInstance(Locale.JAPAN);
1284 
1285         for (int i = 0; i < precedingChars.length(); i++) {
1286             testString.setCharAt(1, precedingChars.charAt(i));
1287             iter.setText(testString.toString());
1288             int j = iter.first();
1289             if (j != 0) {
1290                 errln("ja line break failure: failed to start at 0 and bounced at " + j);
1291             }
1292             j = iter.next();
1293             if (j != 1) {
1294                 errln("ja line break failure: failed to stop before '"
1295                         + precedingChars.charAt(i) + "' (\\u"
1296                         + Integer.toString(precedingChars.charAt(i), 16)
1297                         + ") at 1 and bounded at " + j);
1298             }
1299             j = iter.next();
1300             if (j != 3) {
1301                 errln("ja line break failure: failed to skip position after '"
1302                         + precedingChars.charAt(i) + "' (\\u"
1303                         + Integer.toString(precedingChars.charAt(i), 16)
1304                         + ") at 3 and bounded at " + j);
1305             }
1306         }
1307 
1308         for (int i = 0; i < followingChars.length(); i++) {
1309             testString.setCharAt(1, followingChars.charAt(i));
1310             iter.setText(testString.toString());
1311             int j = iter.first();
1312             if (j != 0) {
1313                 errln("ja line break failure: failed to start at 0 and bounded at " + j);
1314             }
1315             j = iter.next();
1316             if (j != 2) {
1317                 errln("ja line break failure: failed to skip position before '"
1318                         + followingChars.charAt(i) + "' (\\u"
1319                         + Integer.toString(followingChars.charAt(i), 16)
1320                         + ") at 2 and bounded at " + j);
1321             }
1322             j = iter.next();
1323             if (j != 3) {
1324                 errln("ja line break failure: failed to stop after '"
1325                         + followingChars.charAt(i) + "' (\\u"
1326                         + Integer.toString(followingChars.charAt(i), 16)
1327                         + ") at 3 and bounded at " + j);
1328             }
1329         }
1330     }
1331 
1332     /**
1333      * Bug 4638433
1334      */
1335     public void TestLineBreakBasedOnUnicode3_0_0()
1336     {
1337         BreakIterator iter;
1338         int i;
1339 
1340         /* Latin Extend-B characters
1341          * 0x0218-0x0233 which have been added since Unicode 3.0.0.
1342          */
1343         iter = BreakIterator.getWordInstance(Locale.US);
1344         iter.setText("\u0216\u0217\u0218\u0219\u021A");
1345         i = iter.first();
1346         i = iter.next();
1347         if (i != 5) {
1348             errln("Word break failure: failed to stop at 5 and bounded at " + i);
1349         }
1350 
1351 
1352         iter = BreakIterator.getLineInstance(Locale.US);
1353 
1354         /* <Three(Nd)><Two(Nd)><Low Double Prime Quotation Mark(Pe)><One(Nd)>
1355          * \u301f has changed its category from Ps to Pe since Unicode 2.1.
1356          */
1357         iter.setText("32\u301f1");
1358         i = iter.first();
1359         i = iter.next();
1360         if (i != 3) {
1361             errln("Line break failure: failed to skip before \\u301F(Pe) at 3 and bounded at " + i);
1362         }
1363 
1364         /* Mongolian <Letter A(Lo)><Todo Soft Hyphen(Pd)><Letter E(Lo)>
1365          * which have been added since Unicode 3.0.0.
1366          */
1367         iter.setText("\u1820\u1806\u1821");
1368         i = iter.first();
1369         i = iter.next();
1370         if (i != 2) {
1371             errln("Mongolian line break failure: failed to skip position before \\u1806(Pd) at 2 and bounded at " + i);
1372         }
1373 
1374         /* Khmer <ZERO(Nd)><Currency Symbol(Sc)><ONE(Nd)> which have
1375          * been added since Unicode 3.0.0.
1376          */
1377         iter.setText("\u17E0\u17DB\u17E1");
1378         i = iter.first();
1379         i = iter.next();
1380         if (i != 1) {
1381             errln("Khmer line break failure: failed to stop before \\u17DB(Sc) at 1 and bounded at " + i);
1382         }
1383         i = iter.next();
1384         if (i != 3) {
1385             errln("Khmer line break failure: failed to skip position after \\u17DB(Sc) at 3 and bounded at " + i);
1386         }
1387 
1388         /* Ogham <Letter UR(Lo)><Space Mark(Zs)><Letter OR(Lo)> which have
1389          * been added since Unicode 3.0.0.
1390          */
1391         iter.setText("\u1692\u1680\u1696");
1392         i = iter.first();
1393         i = iter.next();
1394         if (i != 2) {
1395             errln("Ogham line break failure: failed to skip postion before \\u1680(Zs) at 2 and bounded at " + i);
1396         }
1397 
1398 
1399         // Confirm changes in BreakIteratorRules_th.java have been reflected.
1400         iter = BreakIterator.getLineInstance(Locale.of("th"));
1401 
1402         /* Thai <Seven(Nd)>
1403          *      <Left Double Quotation Mark(Pi)>
1404          *      <Five(Nd)>
1405          *      <Right Double Quotation Mark(Pf)>
1406          *      <Three(Nd)>
1407          */
1408         iter.setText("\u0E57\u201C\u0E55\u201D\u0E53");
1409         i = iter.first();
1410         i = iter.next();
1411         if (i != 1) {
1412             errln("Thai line break failure: failed to stop before \\u201C(Pi) at 1 and bounded at " + i);
1413         }
1414         i = iter.next();
1415         if (i != 4) {
1416             errln("Thai line break failure: failed to stop after \\u201D(Pf) at 4 and bounded at " + i);
1417         }
1418     }
1419 
1420     /**
1421      * Bug 4068137
1422      */
1423     public void TestEndBehavior()
1424     {
1425         String testString = "boo.";
1426         BreakIterator wb = BreakIterator.getWordInstance();
1427         wb.setText(testString);
1428 
1429         if (wb.first() != 0)
1430             errln("Didn't get break at beginning of string.");
1431         if (wb.next() != 3)
1432             errln("Didn't get break before period in \"boo.\"");
1433         if (wb.current() != 4 && wb.next() != 4)
1434             errln("Didn't get break at end of string.");
1435     }
1436 
1437     // [serialization test has been removed pursuant to bug #4152965]
1438 
1439     /**
1440      * Bug 4450804
1441      */
1442     public void TestLineBreakContractions() {
1443         Vector<String> expected = new Vector<String>();
1444 
1445         expected.add("These ");
1446         expected.add("are ");
1447         expected.add("'foobles'. ");
1448         expected.add("Don't ");
1449         expected.add("you ");
1450         expected.add("like ");
1451         expected.add("them?");
1452         generalIteratorTest(lineBreak, expected);
1453     }
1454 
1455     private static final Pattern CODEPOINT = Pattern.compile("([0-9A-F]{4,5})");
1456     public void TestGraphemeBreak() throws Exception {
1457         Files.lines(Paths.get(System.getProperty("test.root"),
1458                 "../../src/java.base/share/data/unicodedata/auxiliary/GraphemeBreakTest.txt"))
1459                 .map(ln -> ln.replaceFirst("#.*", ""))
1460                 .filter(Predicate.not(String::isEmpty))
1461                 .map(line -> line.split("\\s*÷[\\s\\t]*"))
1462                 .forEach(sa -> {
1463                     Vector<String> expected = new Vector<>(
1464                         Arrays.stream(sa)
1465                             .map(line -> CODEPOINT.matcher(line).replaceAll(mr -> Character.toString(Integer.valueOf(mr.group(),16))))
1466                             .map(line -> line.replaceAll("\\s×\\s", ""))
1467                             .filter(Predicate.not(String::isEmpty))
1468                             .toList());
1469                     generalIteratorTest(characterBreak, expected);
1470                 });
1471     }
1472 
1473     public void TestSetTextIOOBException() {
1474         BreakIterator.getCharacterInstance().setText(new StringCharacterIterator("abcfefg", 1, 5, 3));
1475     }
1476 }