1 /*
   2  * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package build.tools.fixuppandoc;
  27 
  28 import java.io.BufferedReader;
  29 import java.io.BufferedWriter;
  30 import java.io.IOException;
  31 import java.io.InputStreamReader;
  32 import java.io.OutputStreamWriter;
  33 import java.io.PrintStream;
  34 import java.io.PrintWriter;
  35 import java.io.Reader;
  36 import java.io.Writer;
  37 import java.nio.file.Files;
  38 import java.nio.file.Path;
  39 import java.util.ArrayList;
  40 import java.util.LinkedHashMap;
  41 import java.util.LinkedHashSet;
  42 import java.util.List;
  43 import java.util.Locale;
  44 import java.util.Map;
  45 import java.util.Objects;
  46 import java.util.Set;
  47 import java.util.regex.Matcher;
  48 import java.util.regex.Pattern;
  49 
  50 /**
  51  * Fixup HTML generated by pandoc.
  52  *
  53  * <h2>{@code <html>}</h2>
  54  *
  55  * Replace the existing element with {@code <html lang="en">}, removing references to XML.
  56  *
  57  * <h2>{@code <main>}</h2>
  58  *
  59  * {@code <main>} is inserted if palpable content is found that is not within a
  60  * section such as {@code header},  {@code footer},  {@code aside}.
  61  *
  62  * {@code </main>} is inserted if {@code <main>} was inserted and a section
  63  * is started that should not be included in the main section.
  64  *
  65  * <h2>Tables: row headings</h2>
  66  *
  67  * For simple tables, as typically generated by _pandoc_, determine the column
  68  * whose contents are unique, and convert the cells in that column to be header
  69  * cells with {@code scope="row"}. In case of ambiguity, a column containing a
  70  * {@code <th>} whose contents begin with <em>name</em> is preferred.
  71  * When converting the cell, the {@code style} attribute will be updated to
  72  * specify {@code font-weight: normal}, and if there is not already an explicit
  73  * setting for {@code text-align}, then the style will be updated to include
  74  * {@code text-align:left;}.
  75  *
  76  * These rules do not apply if the table contains any cells that include
  77  * a setting for the {@code scope} attribute, or if the table contains
  78  * spanning cells or nested tables.
  79  *
  80  * <h2>{@code <meta name="generator">}</h2>
  81  *
  82  * Update the content string, to indicate it has been processed by this program.
  83  *
  84  */
  85 public class Main {
  86     /**
  87      * Runs the program.
  88      *
  89      * <pre>
  90      *     java build.tools.fixuppandoc.Main [-o output-file] [input-file]
  91      * </pre>
  92      *
  93      * If no input file is specified, the program will read from standard input.
  94      * If no output file is specified, the program will write to standard output.
  95      * Any error messages will be written to the standard error stream.
  96      *
  97      * @param args the command-line arguments
  98      */
  99     public static void main(String... args) {
 100         try {
 101             new Main().run(args);
 102         } catch (IOException | IllegalArgumentException e) {
 103             System.err.println(e);
 104             System.exit(1);
 105         } catch (Throwable t) {
 106             t.printStackTrace(System.err);
 107             System.exit(1);
 108         }
 109     }
 110 
 111     private void run(String... args) throws IOException {
 112         Path inFile = null;
 113         Path outFile = null;
 114 
 115         for (int i = 0; i < args.length; i++) {
 116             String arg = args[i];
 117             if (arg.equals("-o") && i + 1 < args.length) {
 118                 outFile = Path.of(args[++i]);
 119             } else if (arg.startsWith("-")) {
 120                 throw new IllegalArgumentException(arg);
 121             } else if (inFile == null) {
 122                 inFile = Path.of(arg);
 123             } else {
 124                 throw new IllegalArgumentException(arg);
 125             }
 126         }
 127 
 128         new Fixup().run(inFile, outFile);
 129     }
 130 
 131     /**
 132      * A class to read HTML, copying input to output, modifying
 133      * fragments as needed.
 134      */
 135     class Fixup extends HtmlParser {
 136         /** The output stream. */
 137         PrintWriter out;
 138 
 139         /** A stream for reporting errors. */
 140         PrintStream err = System.err;
 141 
 142         /**
 143          * Flag to indicate when {@code <main>} is permitted around palpable content.
 144          * Set within {@code <body>}; disabled within elements in which {@code <main>}
 145          * is not permitted.
 146          */
 147         boolean allowMain = false;
 148 
 149         /**
 150          * Flag to indicate that {@code <main>} is required.
 151          * Set on {@code <body>}; reset when {@code <main>} is either found or generated.
 152          */
 153         boolean needMain = false;
 154 
 155         /**
 156          * Flag to indicate that {@code </main>} is required.
 157          * Set if {@code <main>} is generated.
 158          * Reset when a start or end element is found that requires that {@code </main>}
 159          * needs to be generated if necessary.
 160          */
 161         boolean needEndMain = false;
 162 
 163         /**
 164          * Handler for {@code <table>} elements.
 165          */
 166         Table table;
 167 
 168         /**
 169          * Run the program, copying an input file to an output file.
 170          * If the input file is {@code null}, input is read from the standard input.
 171          * If the output file is {@code null}, output is written to the standard output.
 172          *
 173          * @param inFile the input file
 174          * @param outFile the output file
 175          * @throws IOException if an IO error occurs
 176          */
 177         void run(Path inFile, Path outFile) throws IOException {
 178             try (Writer out = openWriter(outFile)) {
 179                 this.out = new PrintWriter(out);
 180                 if (inFile != null) {
 181                     read(inFile);
 182                 } else {
 183                     read(new BufferedReader(new InputStreamReader(System.in)));
 184                 }
 185             }
 186         }
 187 
 188         /**
 189          * Returns a writer for a file, or for the standard output if the file is {@code null}.
 190          *
 191          * @param file the file
 192          * @return the writer
 193          * @throws IOException if an IO error occurs
 194          */
 195         private Writer openWriter(Path file) throws IOException {
 196             if (file != null) {
 197                 return Files.newBufferedWriter(file);
 198             } else {
 199                 return new BufferedWriter(new OutputStreamWriter(System.out) {
 200                     @Override
 201                     public void close() throws IOException {
 202                         flush();
 203                     }
 204                 });
 205             }
 206         }
 207 
 208         @Override
 209         protected void error(Path file, int lineNumber, String message) {
 210             err.print(file == null ? "<stdin>" : file);
 211             if (lineNumber > 0) {
 212                 err.print(":");
 213                 err.print(lineNumber);
 214             }
 215             err.print(": ");
 216             err.println(message);
 217         }
 218 
 219         @Override
 220         protected void error(Path file, int lineNumber, Throwable t) {
 221             error(file, lineNumber, t.toString());
 222             t.printStackTrace(err);
 223         }
 224 
 225         /**
 226          * The buffer in which input is stored until an appropriate action can be determined.
 227          * Using the buffer ensures that the output exactly matches the input, except where
 228          * it is intentionally modified.
 229          */
 230         private StringBuilder buffer = new StringBuilder();
 231 
 232         @Override
 233         public int nextChar() throws IOException {
 234             if (ch > 0) {
 235                 buffer.append((char) ch);
 236             }
 237             return super.nextChar();
 238         }
 239 
        /**
         * Copies a doctype declaration through unchanged, by flushing the
         * buffered input.
         *
         * @param s the doctype declaration; not used
         */
        @Override
        protected void doctype(String s) {
            flushBuffer();
        }
 244 
 245         @Override
 246         protected void startElement(String name, Map<String,String> attrs, boolean selfClosing) {
 247             switch (name) {
 248                 case "html":
 249                     // replace the existing <html> fragment
 250                     out.write("<html lang=\"en\">");
 251                     buffer.setLength(0);
 252                     break;
 253 
 254                 case "meta":
 255                     // update the meta-data for the generator
 256                     if (Objects.equals(attrs.get("name"), "generator")) {
 257                         out.write(buffer.toString()
 258                                 .replaceAll("(content=\"[^\"]*)(\")", "$1,fixuphtml$2"));
 259                         buffer.setLength(0);
 260                     }
 261                     break;
 262 
 263                 case "article":
 264                 case "aside":
 265                 case "footer":
 266                 case "header":
 267                 case "nav":
 268                     // starting one of these elements will terminate <main> if one is being
 269                     // inserted
 270                     if (needEndMain) {
 271                         out.write("</main>");
 272                         needEndMain = false;
 273                     }
 274                     // <main> is not permitted within these elements
 275                     allowMain = false;
 276                     break;
 277 
 278                 case "body":
 279                     // within <body>, <main> is both permitted and required
 280                     allowMain = true;
 281                     needMain = true;
 282                     break;
 283 
 284                 case "main":
 285                     // an explicit <main> found in the input; no need to add one
 286                     needMain = false;
 287                     break;
 288 
 289                 case "table":
 290                     // The entire content of a <table> is buffered, until it can be
 291                     // determined in which column of the table contains the cells
 292                     // that can be used to identify the row.
 293                     if (table == null) {
 294                         table = new Table();
 295                     } else {
 296                         // tables containing nested tables are not updated
 297                         table.simple = false;
 298                     }
 299                     table.nestDepth++;
 300                     break;
 301 
 302                 case "thead":
 303                 case "tbody":
 304                     if (table != null) {
 305                         table.endCell();
 306                     }
 307                     break;
 308 
 309                 case "tr":
 310                     if (table != null) {
 311                         table.endCell();
 312                         table.nextCellColumnIndex = 0;
 313                     }
 314                     break;
 315 
 316                 case "td":
 317                 case "th":
 318                     if (table != null) {
 319                         if (attrs.containsKey("rowspan")
 320                                 || attrs.containsKey("colspan")
 321                                 || attrs.containsKey("scope")) {
 322                             // tables containing spanning cells and tables that already
 323                             // contain scope attributes are not updated
 324                             table.simple = false;
 325                         }
 326                         table.startCell(name);
 327                     }
 328                     break;
 329             }
 330 
 331             // by default, the content is deemed to be palpable content, and so
 332             // insert <main> if it is permitted and one is still required,
 333             // while also ensuring that it does not appear before <body>
 334             if (allowMain && needMain && !name.equals("body")) {
 335                 out.write("<main>");
 336                 needMain = false;
 337                 needEndMain = true;
 338             }
 339 
 340             flushBuffer();
 341         }
 342 
 343         @Override
 344         protected void endElement(String name) {
 345             switch (name) {
 346                 case "article":
 347                 case "aside":
 348                 case "footer":
 349                 case "header":
 350                 case "nav":
 351                     // The code does not handle nested elements of these kinds, but could.
 352                     // So, assuming they are not nested, ending these elements implies
 353                     // that <main> is once again permitted.
 354                     allowMain = true;
 355                     break;
 356 
 357                 case "body":
 358                     // The document is nearly done; insert <main> if needed
 359                     if (needEndMain) {
 360                         out.write("</main>");
 361                         needEndMain = false;
 362                     }
 363                     break;
 364 
 365                 case "table":
 366                     // if the table is finished, analyze it and write it out
 367                     if (table != null) {
 368                         if (--table.nestDepth == 0) {
 369                             table.add(buffer.toString());
 370                             table.write(out);
 371                             table = null;
 372                             buffer.setLength(0);
 373                         }
 374                     }
 375                     break;
 376 
 377                 case "thead":
 378                 case "tbody":
 379                 case "tr":
 380                 case "td":
 381                 case "th":
 382                     // ending any of these elements implicity or explicitly ends the
 383                     // current cell
 384                     table.endCell();
 385                     break;
 386 
 387             }
 388             flushBuffer();
 389         }
 390 
 391         @Override
 392         protected void content(String content) {
 393             if (table != null) {
 394                 table.content(content);
 395             } else if (allowMain && needMain && !content.isBlank()) {
 396                 // insert <main> if required and if we have palpable content
 397                 out.write("<main>");
 398                 needMain = false;
 399                 needEndMain = true;
 400             }
 401             flushBuffer();
 402         }
 403 
        /**
         * Copies a comment through unchanged, by flushing the buffered input.
         *
         * @param comment the comment; not used
         */
        @Override
        protected void comment(String comment) {
            flushBuffer();
        }
 408 
 409         /**
 410          * Flushes the buffer, either by adding it into a table, if one is
 411          * in progress, or by writing it out.
 412          */
 413         private void flushBuffer() {
 414             String s = buffer.toString();
 415             if (table != null) {
 416                 table.add(s);
 417             } else {
 418                 out.write(s);
 419             }
 420             buffer.setLength(0);
 421 
 422         }
 423     }
 424 
 425     /**
 426      * Storage for the content of a {@code <table>} element} until we can determine
 427      * whether we should add {@code scope="row"} to the cells in a given column,
 428      * and if so, which column.
 429      *
 430      * The column with the highest number of unique entries is selected;
 431      * in case of ambiguity, a column whose heading begins "name" is chosen.
 432      *
 433      * Only "simple" tables are supported. Tables with any of the following
 434      * features are not considered "simple" and will not be modified:
 435      * <ul>
 436      *     <li>Tables containing nested tables</li>
 437      *     <li>Tables containing cells that use "rowspan" and "colspan" attributes</li>
 438      *     <li>Tables containing cells that already use "scope" attributes</li>
 439      * </ul>
 440      */
 441     class Table {
 442         /**
 443          * A fragment of HTML in this table.
 444          */
 445         class Entry {
 446             /** The fragment. */
 447             final String html;
 448             /** The column for a {@code <td>} fragment, or -1. */
 449             final int column;
 450 
 451             Entry(String html, int column) {
 452                 this.html = html;
 453                 this.column = column;
 454             }
 455         }
 456 
 457         /** Whether or not this is a "simple" table. */
 458         boolean simple = true;
 459 
 460         /** The nesting depth of the current table, within enclosing tables. */
 461         int nestDepth;
 462 
 463         /** A list of the HTML fragments that make up this table. */
 464         List<Entry> entries;
 465 
 466         /** The plain text contents of each column, used to determine the primary column. */
 467         List<Set<String>> columnContents;
 468 
 469         /** The column index of the next cell to be found. */
 470         int nextCellColumnIndex;
 471 
 472         /** A flag to mark the start of a {@code <td>} cell. */
 473         boolean startTDCell;
 474 
 475         /** The column index of the current cell, or -1 if not in a cell. */
 476         int currCellColumnIndex;
 477 
 478         /** The plain text contents of the current column. */
 479         Set<String> currColumnContents;
 480 
 481         /** The plain text content of the current cell. */
 482         StringBuilder currCellContent;
 483 
 484         /** The kind ({@code th} or {@code td}) of the current cell. */
 485         String currCellKind;
 486 
 487         /**
 488          * The index of the column, if any, containing a heading beginning "name".
 489          * This column is given preferential treatment when deciding the primary column.
 490          */
 491         int nameColumn;
 492 
 493         Table() {
 494             entries = new ArrayList<>();
 495             columnContents = new ArrayList<>();
 496         }
 497 
 498         void startCell(String name) {
 499             endCell();
 500             startTDCell = name.equals("td");
 501             currCellColumnIndex = nextCellColumnIndex++;
 502             currColumnContents = getColumn(currCellColumnIndex);
 503             currCellContent = new StringBuilder();
 504             currCellKind = name;
 505         }
 506 
 507         void endCell() {
 508             if (currCellContent != null) {
 509                 String c = currCellContent.toString().trim();
 510                 if (Objects.equals(currCellKind, "th")
 511                         && c.toLowerCase(Locale.US).startsWith("name")) {
 512                     nameColumn = currCellColumnIndex;
 513                 }
 514                 currColumnContents.add(c);
 515                 currCellContent = null;
 516                 currCellColumnIndex = -1;
 517                 currColumnContents = null;
 518             }
 519         }
 520 
 521         void content(String content) {
 522             if (currCellContent != null) {
 523                 currCellContent.append(content);
 524             }
 525         }
 526 
 527         void add(String html) {
 528             int index = startTDCell ? currCellColumnIndex : -1;
 529             entries.add(new Entry(html, index));
 530             startTDCell = false;
 531         }
 532 
 533         void write(PrintWriter out) {
 534             int max = -1;
 535             int maxIndex = -1;
 536             int index = 0;
 537             for (Set<String> c : columnContents) {
 538                 if (c.size() > max || c.size() == max && index == nameColumn) {
 539                     max = c.size();
 540                     maxIndex = index;
 541                 }
 542                 index++;
 543             }
 544             boolean updateEndTd = false;
 545             Pattern styleAttr = Pattern.compile("(?<before>.*style=\")(?<style>[^\"]*)(?<after>\".*)");
 546             for (Entry e : entries) {
 547                 if (simple && e.column == maxIndex) {
 548                     String attrs = e.html.substring(3, e.html.length() - 1);
 549                     out.write("<th");
 550                     Matcher m = styleAttr.matcher(attrs);
 551                     if (m.matches()) {
 552                         out.write(m.group("before"));
 553                         out.write("font-weight: normal; ");
 554                         String style = m.group("style");
 555                         if (!style.contains("text-align")) {
 556                             out.write("text-align: left; ");
 557                         }
 558                         out.write(style);
 559                         out.write(m.group("after"));
 560                     } else {
 561                         out.write(" style=\"font-weight: normal; text-align:left;\" ");
 562                         out.write(attrs);
 563                     }
 564                     out.write(" scope=\"row\"");
 565                     out.write(">");
 566                     updateEndTd = true;
 567                 } else if (updateEndTd && e.html.equalsIgnoreCase("</td>")) {
 568                     out.write("</th>");
 569                     updateEndTd = false;
 570                 } else {
 571                     out.write(e.html);
 572                     if (updateEndTd && e.html.regionMatches(true, 0, "<td", 0, 3)) {
 573                         // a new cell has been started without explicitly closing the
 574                         // cell that was being updated
 575                         updateEndTd = false;
 576                     }
 577                 }
 578             }
 579         }
 580 
 581         private Set<String> getColumn(int index) {
 582             while (columnContents.size() <= index) {
 583                 columnContents.add(new LinkedHashSet<>());
 584             }
 585 
 586             return columnContents.get(index);
 587         }
 588     }
 589 
 590     /**
 591      * A basic HTML parser.
 592      * Override the protected methods as needed to get notified of significant items
 593      * in any file that is read.
 594      */
 595     abstract class HtmlParser {
 596 
 597         private Path file;
 598         private Reader in;
 599         protected int ch;
 600         private int lineNumber;
 601         private boolean inScript;
 602         private boolean xml;
 603 
 604         /**
 605          * Read a file.
 606          * @param file the file
 607          */
 608         void read(Path file) {
 609             try (Reader r = Files.newBufferedReader(file)) {
 610                 this.file = file;
 611                 read(r);
 612             } catch (IOException e) {
 613                 error(file, -1, e);
 614             }
 615         }
 616 
        /**
         * Creates a parser. The input is provided later, when one of the
         * {@code read} methods is called.
         */
        HtmlParser() { }
 618 
 619         /**
 620          * Read a stream.
 621          * @param r the stream
 622          */
 623         void read(Reader r) {
 624             try {
 625                 this.in = r;
 626                 StringBuilder content = new StringBuilder();
 627 
 628                 startFile(file);
 629                 try {
 630                     lineNumber = 1;
 631                     xml = false;
 632                     nextChar();
 633 
 634                     while (ch != -1) {
 635                         if (ch == '<') {
 636                             content(content.toString());
 637                             content.setLength(0);
 638                             html();
 639                         } else {
 640                             content.append((char) ch);
 641                             if (ch == '\n') {
 642                                 content(content.toString());
 643                                 content.setLength(0);
 644                             }
 645                             nextChar();
 646                         }
 647                     }
 648                 } finally {
 649                     endFile();
 650                 }
 651             } catch (IOException e) {
 652                 error(file, lineNumber, e);
 653             } catch (Throwable t) {
 654                 error(file, lineNumber, t);
 655                 t.printStackTrace(System.err);
 656             }
 657         }
 658 
        /**
         * Returns the current line number. The line number is set to 1 when
         * reading begins, and is increased each time a newline character is read.
         *
         * @return the line number
         */
        protected int getLineNumber() {
            return lineNumber;
        }
 662 
        /**
         * Called when a file has been opened, before parsing begins.
         * This is always the first notification when reading a file.
         * This implementation does nothing.
         *
         * @param file the file, or {@code null} if a stream is being read and
         *      no file has been read by this parser
         */
        protected void startFile(Path file) { }
 671 
        /**
         * Called when the parser has finished reading a file.
         * This is always the last notification when reading a file,
         * unless any errors occur while closing the file.
         * It is also called if parsing is abandoned because of an exception.
         * This implementation does nothing.
         */
        protected void endFile() { }
 679 
 680         /**
 681          * Called when a doctype declaration is found, at the beginning of the file.
 682          * This implementation does nothing.
 683          * @param s the doctype declaration
 684          */
 685         protected void doctype(String s) { }
 686 
        /**
         * Called when the opening tag of an HTML element is encountered,
         * after the whole of the tag has been read.
         * This implementation does nothing.
         * @param name the name of the tag, converted to lower case
         * @param attrs the attributes of the tag
         * @param selfClosing whether or not this is a self-closing tag
         */
        protected void startElement(String name, Map<String,String> attrs, boolean selfClosing) { }
 695 
        /**
         * Called when the closing tag of an HTML element is encountered,
         * after the whole of the tag has been read.
         * This implementation does nothing.
         * @param name the name of the tag, converted to lower case
         */
        protected void endElement(String name) { }
 702 
        /**
         * Called for sequences of character content.
         * A sequence ends at the end of a line, in which case it includes the
         * newline character, or at the next {@code <} character. The sequence
         * may be empty, if there is no content before a {@code <} character.
         * This implementation does nothing.
         * @param content the character content
         */
        protected void content(String content) { }
 708 
        /**
         * Called for sequences of comment.
         * This implementation does nothing.
         *
         * NOTE(review): the parser builds the text of a comment by appending the
         * {@code int} value of each character, which appends the decimal form of
         * the character code, not the character; confirm before relying on the
         * value given here.
         *
         * @param comment the comment
         */
        protected void comment(String comment) { }
 714 
        /**
         * Called when an error has been encountered.
         * @param file the file being read, or {@code null} if a stream is being
         *      read and no file has been read by this parser
         * @param lineNumber the line number of line containing the error,
         *      or -1 if there is no relevant line
         * @param message a description of the error
         */
        protected abstract void error(Path file, int lineNumber, String message);
 722 
        /**
         * Called when an exception has been encountered.
         * @param file the file being read, or {@code null} if a stream is being
         *      read and no file has been read by this parser
         * @param lineNumber the line number of the line being read when the exception
         *      was found, or -1 if the exception occurred while opening or closing
         *      the file
         * @param t the exception
         */
        protected abstract void error(Path file, int lineNumber, Throwable t);
 730 
 731         protected int nextChar() throws IOException {
 732             ch = in.read();
 733             if (ch == '\n')
 734                 lineNumber++;
 735             return ch;
 736         }
 737 
 738         /**
 739          * Read the start or end of an HTML tag, or an HTML comment
 740          * {@literal <identifier attrs> } or {@literal </identifier> }
 741          * @throws java.io.IOException if there is a problem reading the file
 742          */
 743         protected void html() throws IOException {
 744             nextChar();
 745             if (isIdentifierStart((char) ch)) {
 746                 String name = readIdentifier().toLowerCase(Locale.US);
 747                 Map<String,String> attrs = htmlAttrs();
 748                 if (attrs != null) {
 749                     boolean selfClosing = false;
 750                     if (ch == '/') {
 751                         nextChar();
 752                         selfClosing = true;
 753                     }
 754                     if (ch == '>') {
 755                         nextChar();
 756                         startElement(name, attrs, selfClosing);
 757                         if (name.equals("script")) {
 758                             inScript = true;
 759                         }
 760                         return;
 761                     }
 762                 }
 763             } else if (ch == '/') {
 764                 nextChar();
 765                 if (isIdentifierStart((char) ch)) {
 766                     String name = readIdentifier().toLowerCase(Locale.US);
 767                     skipWhitespace();
 768                     if (ch == '>') {
 769                         nextChar();
 770                         endElement(name);
 771                         if (name.equals("script")) {
 772                             inScript = false;
 773                         }
 774                         return;
 775                     }
 776                 }
 777             } else if (ch == '!') {
 778                 nextChar();
 779                 if (ch == '-') {
 780                     nextChar();
 781                     if (ch == '-') {
 782                         nextChar();
 783                         StringBuilder comment = new StringBuilder();
 784                         while (ch != -1) {
 785                             int dash = 0;
 786                             while (ch == '-') {
 787                                 dash++;
 788                                 comment.append(ch);
 789                                 nextChar();
 790                             }
 791                             // Strictly speaking, a comment should not contain "--"
 792                             // so dash > 2 is an error, dash == 2 implies ch == '>'
 793                             // See http://www.w3.org/TR/html-markup/syntax.html#syntax-comments
 794                             // for more details.
 795                             if (dash >= 2 && ch == '>') {
 796                                 comment.setLength(comment.length() - 2);
 797                                 comment(comment.toString());
 798                                 nextChar();
 799                                 return;
 800                             }
 801 
 802                             comment.append(ch);
 803                             nextChar();
 804                         }
 805                     }
 806                 } else if (ch == '[') {
 807                     nextChar();
 808                     if (ch == 'C') {
 809                         nextChar();
 810                         if (ch == 'D') {
 811                             nextChar();
 812                             if (ch == 'A') {
 813                                 nextChar();
 814                                 if (ch == 'T') {
 815                                     nextChar();
 816                                     if (ch == 'A') {
 817                                         nextChar();
 818                                         if (ch == '[') {
 819                                             while (true) {
 820                                                 nextChar();
 821                                                 if (ch == ']') {
 822                                                     nextChar();
 823                                                     if (ch == ']') {
 824                                                         nextChar();
 825                                                         if (ch == '>') {
 826                                                             nextChar();
 827                                                             return;
 828                                                         }
 829                                                     }
 830                                                 }
 831                                             }
 832 
 833                                         }
 834                                     }
 835                                 }
 836                             }
 837                         }
 838                     }
 839                 } else {
 840                     StringBuilder sb = new StringBuilder();
 841                     while (ch != -1 && ch != '>') {
 842                         sb.append((char) ch);
 843                         nextChar();
 844                     }
 845                     Pattern p = Pattern.compile("(?is)doctype\\s+html\\s?.*");
 846                     String s = sb.toString();
 847                     if (p.matcher(s).matches()) {
 848                         doctype(s);
 849                         return;
 850                     }
 851                 }
 852             } else if (ch == '?') {
 853                 nextChar();
 854                 if (ch == 'x') {
 855                     nextChar();
 856                     if (ch == 'm') {
 857                         nextChar();
 858                         if (ch == 'l') {
 859                             Map<String,String> attrs = htmlAttrs();
 860                             if (ch == '?') {
 861                                 nextChar();
 862                                 if (ch == '>') {
 863                                     nextChar();
 864                                     xml = true;
 865                                     return;
 866                                 }
 867                             }
 868                         }
 869                     }
 870 
 871                 }
 872             }
 873 
 874             if (!inScript) {
 875                 error(file, lineNumber, "bad html");
 876             }
 877         }
 878 
 879         /**
 880          * Read a series of HTML attributes, terminated by {@literal > }.
 881          * Each attribute is of the form {@literal identifier[=value] }.
 882          * "value" may be unquoted, single-quoted, or double-quoted.
 883          */
 884         private Map<String,String> htmlAttrs() throws IOException {
 885             Map<String, String> map = new LinkedHashMap<>();
 886             skipWhitespace();
 887 
 888             while (isIdentifierStart((char) ch)) {
 889                 String name = readAttributeName().toLowerCase(Locale.US);
 890                 skipWhitespace();
 891                 String value = null;
 892                 if (ch == '=') {
 893                     nextChar();
 894                     skipWhitespace();
 895                     if (ch == '\'' || ch == '"') {
 896                         char quote = (char) ch;
 897                         nextChar();
 898                         StringBuilder sb = new StringBuilder();
 899                         while (ch != -1 && ch != quote) {
 900                             sb.append((char) ch);
 901                             nextChar();
 902                         }
 903                         value = sb.toString() // hack to replace common entities
 904                                 .replace("&lt;", "<")
 905                                 .replace("&gt;", ">")
 906                                 .replace("&amp;", "&");
 907                         nextChar();
 908                     } else {
 909                         StringBuilder sb = new StringBuilder();
 910                         while (ch != -1 && !isUnquotedAttrValueTerminator((char) ch)) {
 911                             sb.append((char) ch);
 912                             nextChar();
 913                         }
 914                         value = sb.toString();
 915                     }
 916                     skipWhitespace();
 917                 }
 918                 map.put(name, value);
 919             }
 920 
 921             return map;
 922         }
 923 
 924         private boolean isIdentifierStart(char ch) {
 925             return Character.isUnicodeIdentifierStart(ch);
 926         }
 927 
 928         private String readIdentifier() throws IOException {
 929             StringBuilder sb = new StringBuilder();
 930             sb.append((char) ch);
 931             nextChar();
 932             while (ch != -1 && Character.isUnicodeIdentifierPart(ch)) {
 933                 sb.append((char) ch);
 934                 nextChar();
 935             }
 936             return sb.toString();
 937         }
 938 
 939         private String readAttributeName() throws IOException {
 940             StringBuilder sb = new StringBuilder();
 941             sb.append((char) ch);
 942             nextChar();
 943             while (ch != -1 && Character.isUnicodeIdentifierPart(ch)
 944                     || ch == '-'
 945                     || (xml || sb.toString().startsWith("xml")) && ch == ':') {
 946                 sb.append((char) ch);
 947                 nextChar();
 948             }
 949             return sb.toString();
 950         }
 951 
 952         private boolean isWhitespace(char ch) {
 953             return Character.isWhitespace(ch);
 954         }
 955 
 956         private void skipWhitespace() throws IOException {
 957             while (isWhitespace((char) ch)) {
 958                 nextChar();
 959             }
 960         }
 961 
 962         private boolean isUnquotedAttrValueTerminator(char ch) {
 963             switch (ch) {
 964                 case '\f': case '\n': case '\r': case '\t':
 965                 case ' ':
 966                 case '"': case '\'': case '`':
 967                 case '=': case '<': case '>':
 968                     return true;
 969                 default:
 970                     return false;
 971             }
 972         }
 973     }
 974 
 975 }