< prev index next >

src/java.base/share/classes/java/util/regex/Pattern.java

Print this page




2083                         // when put the "prev" into the branch as the first atom.
2084                         firstTail.next = branchConn;
2085                     }
2086                     prev = branch = new Branch(prev, node, branchConn);
2087                 }
2088             }
2089             if (peek() != '|') {
2090                 return prev;
2091             }
2092             next();
2093         }
2094     }
2095 
2096     @SuppressWarnings("fallthrough")
2097     /**
2098      * Parsing of sequences between alternations.
2099      */
2100     private Node sequence(Node end) {
2101         Node head = null;
2102         Node tail = null;
2103         Node node;
2104     LOOP:
2105         for (;;) {
2106             int ch = peek();
2107             switch (ch) {
2108             case '(':
2109                 // Because group handles its own closure,
2110                 // we need to treat it differently
2111                 node = group0();
2112                 // Check for comment or flag group
2113                 if (node == null)
2114                     continue;
2115                 if (head == null)
2116                     head = node;
2117                 else
2118                     tail.next = node;
2119                 // Double return: Tail was returned in root
2120                 tail = root;
2121                 continue;
2122             case '[':
2123                 if (has(CANON_EQ) && !has(LITERAL))


2600             if (inclass) break;
2601             if (create) root = new End();
2602             return -1;
2603         default:
2604             return ch;
2605         }
2606         throw error("Illegal/unsupported escape sequence");
2607     }
2608 
2609     /**
2610      * Parse a character class, and return the node that matches it.
2611      *
2612      * Consumes a ] on the way out if consume is true. Usually consume
2613      * is true except for the case of [abc&&def] where def is a separate
2614      * right hand node with "understood" brackets.
2615      */
2616     private CharPredicate clazz(boolean consume) {
2617         CharPredicate prev = null;
2618         CharPredicate curr = null;
2619         BitClass bits = new BitClass();

2620 
2621         boolean isNeg = false;
2622         boolean hasBits = false;
2623         int ch = next();
2624 
2625         // Negates if first char in a class, otherwise literal
2626         if (ch == '^' && temp[cursor-1] == '[') {
2627             ch = next();
2628             isNeg = true;
2629         }
2630         for (;;) {
2631             switch (ch) {
2632                 case '[':
2633                     curr = clazz(true);
2634                     if (prev == null)
2635                         prev = curr;
2636                     else
2637                         prev = prev.union(curr);
2638                     ch = peek();
2639                     continue;
2640                 case '&':
2641                     ch = next();
2642                     if (ch == '&') {
2643                         ch = next();
2644                         CharPredicate right = null;
2645                         while (ch != ']' && ch != '&') {
2646                             if (ch == '[') {
2647                                 if (right == null)
2648                                     right = clazz(true);
2649                                 else
2650                                     right = right.union(clazz(true));
2651                             } else { // abc&&def
2652                                 unread();
2653                                 right = clazz(false);
2654                             }
2655                             ch = peek();
2656                         }
2657                         if (hasBits) {
2658                             // bits used, union has high precedence
2659                             if (prev == null) {
2660                                 prev = curr = bits;
2661                             } else {
2662                                 prev = prev.union(bits);
2663                             }
2664                             hasBits = false;
2665                         }
2666                         if (right != null)
2667                             curr = right;
2668                         if (prev == null) {
2669                             if (right == null)
2670                                 throw error("Bad class syntax");
2671                             else
2672                                 prev = right;
2673                         } else {
2674                             prev = prev.and(curr);
2675                         }
2676                     } else {
2677                         // treat as a literal &
2678                         unread();
2679                         break;
2680                     }
2681                     continue;
2682                 case 0:
2683                     if (cursor >= patternLength)
2684                         throw error("Unclosed character class");
2685                     break;
2686                 case ']':
2687                     if (prev != null || hasBits) {
2688                         if (consume)
2689                             next();
2690                         if (prev == null)
2691                             prev = bits;
2692                         else if (hasBits)
2693                             prev = prev.union(bits);
2694                         if (isNeg)
2695                             return prev.negate();
2696                         return prev;
2697                     }
2698                     break;
2699                 default:
2700                     break;
2701             }
2702             curr = range(bits);
2703             if (curr == null) {    // the bits used
2704                 hasBits = true;
2705             } else {
2706                 if (prev == null)
2707                     prev = curr;
2708                 else if (prev != curr)
2709                     prev = prev.union(curr);
2710             }
2711             ch = peek();
2712         }
2713     }


2929      */
2930     private String groupname(int ch) {
2931         StringBuilder sb = new StringBuilder();
2932         if (!ASCII.isAlpha(ch))
2933             throw error("capturing group name does not start with a Latin letter");
2934         do {
2935             sb.append((char) ch);
2936         } while (ASCII.isAlnum(ch=read()));
2937         if (ch != '>')
2938             throw error("named capturing group is missing trailing '>'");
2939         return sb.toString();
2940     }
2941 
2942     /**
2943      * Parses a group and returns the head node of a set of nodes that process
2944      * the group. Sometimes a double return system is used where the tail is
2945      * returned in root.
2946      */
2947     private Node group0() {
2948         boolean capturingGroup = false;
2949         Node head;
2950         Node tail;
2951         int save = flags0;
2952         int saveTCNCount = topClosureNodes.size();
2953         root = null;
2954         int ch = next();
2955         if (ch == '?') {
2956             ch = skip();
2957             switch (ch) {
2958             case ':':   //  (?:xxx) pure group
2959                 head = createGroup(true);
2960                 tail = root;
2961                 head.next = expr(tail);
2962                 break;
2963             case '=':   // (?=xxx) and (?!xxx) lookahead
2964             case '!':
2965                 head = createGroup(true);
2966                 tail = root;
2967                 head.next = expr(tail);
2968                 if (ch == '=') {
2969                     head = tail = new Pos(head);
2970                 } else {


2979                 break;
2980             case '<':   // (?<xxx)  look behind
2981                 ch = read();
2982                 if (ch != '=' && ch != '!') {
2983                     // named captured group
2984                     String name = groupname(ch);
2985                     if (namedGroups().containsKey(name))
2986                         throw error("Named capturing group <" + name
2987                                     + "> is already defined");
2988                     capturingGroup = true;
2989                     head = createGroup(false);
2990                     tail = root;
2991                     namedGroups().put(name, capturingGroupCount-1);
2992                     head.next = expr(tail);
2993                     break;
2994                 }
2995                 int start = cursor;
2996                 head = createGroup(true);
2997                 tail = root;
2998                 head.next = expr(tail);
2999                 tail.next = LookBehindEndNode.INSTANCE;
3000                 TreeInfo info = new TreeInfo();
3001                 head.study(info);
3002                 if (info.maxValid == false) {
3003                     throw error("Look-behind group does not have "
3004                                 + "an obvious maximum length");
3005                 }
3006                 boolean hasSupplementary = findSupplementary(start, patternLength);
3007                 if (ch == '=') {
3008                     head = tail = (hasSupplementary ?
3009                                    new BehindS(head, info.maxLength,
3010                                                info.minLength) :
3011                                    new Behind(head, info.maxLength,
3012                                               info.minLength));
3013                 } else { // if (ch == '!')
3014                     head = tail = (hasSupplementary ?
3015                                    new NotBehindS(head, info.maxLength,
3016                                                   info.minLength) :
3017                                    new NotBehind(head, info.maxLength,
3018                                                  info.minLength));
3019                 }


3235             next();
3236             return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY);
3237         } else if (ch == '+') {
3238             next();
3239             return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE);
3240         }
3241         if (prev instanceof BmpCharProperty) {
3242             return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
3243         } else if (prev instanceof CharProperty) {
3244             return new CharPropertyGreedy((CharProperty)prev, cmin);
3245         }
3246         return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY);
3247     }
3248 
3249     /**
3250      * Processes repetition. If the next character peeked is a quantifier
3251      * then new nodes must be appended to handle the repetition.
3252      * Prev could be a single or a group, so it could be a chain of nodes.
3253      */
3254     private Node closure(Node prev) {

3255         int ch = peek();
3256         switch (ch) {
3257         case '?':
3258             ch = next();
3259             if (ch == '?') {
3260                 next();
3261                 return new Ques(prev, Qtype.LAZY);
3262             } else if (ch == '+') {
3263                 next();
3264                 return new Ques(prev, Qtype.POSSESSIVE);
3265             }
3266             return new Ques(prev, Qtype.GREEDY);
3267         case '*':
3268             return curly(prev, 0);
3269         case '+':
3270             return curly(prev, 1);
3271         case '{':
3272             ch = skip();
3273             if (ASCII.isDigit(ch)) {
3274                 int cmin = 0, cmax;
3275                 try {
3276                     do {
3277                         cmin = Math.addExact(Math.multiplyExact(cmin, 10),
3278                                              ch - '0');
3279                     } while (ASCII.isDigit(ch = read()));

3280                     if (ch == ',') {
3281                         ch = read();
3282                         if (ch == '}') {
3283                             unread();
3284                             return curly(prev, cmin);
3285                         } else {
3286                             cmax = 0;
3287                             while (ASCII.isDigit(ch)) {
3288                                 cmax = Math.addExact(Math.multiplyExact(cmax, 10),
3289                                                      ch - '0');
3290                                 ch = read();
3291                             }
3292                         }
3293                     } else {
3294                         cmax = cmin;
3295                     }
3296                 } catch (ArithmeticException ae) {
3297                     throw error("Illegal repetition range");
3298                 }
3299                 if (ch != '}')
3300                     throw error("Unclosed counted closure");
3301                 if (cmax < cmin)
3302                     throw error("Illegal repetition range");

3303                 ch = peek();
3304                 if (ch == '?') {
3305                     next();
3306                     return new Curly(prev, cmin, cmax, Qtype.LAZY);
3307                 } else if (ch == '+') {
3308                     next();
3309                     return new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
3310                 } else {
3311                     return new Curly(prev, cmin, cmax, Qtype.GREEDY);
3312                 }

3313             } else {
3314                 throw error("Illegal repetition");
3315             }
3316         default:
3317             return prev;
3318         }
3319     }
3320 
3321     /**
3322      *  Utility method for parsing control escape sequences.
3323      */
3324     private int c() {
3325         if (cursor < patternLength) {
3326             return read() ^ 64;
3327         }
3328         throw error("Illegal control escape sequence");
3329     }
3330 
3331     /**
3332      *  Utility method for parsing octal escape sequences.


3467 
3468     private static final int countCodePoints(CharSequence seq) {
3469         int length = seq.length();
3470         int n = 0;
3471         for (int i = 0; i < length; ) {
3472             n++;
3473             if (Character.isHighSurrogate(seq.charAt(i++))) {
3474                 if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
3475                     i++;
3476                 }
3477             }
3478         }
3479         return n;
3480     }
3481 
3482     /**
3483      *  Creates a bit vector for matching Latin-1 values. A normal BitClass
3484      *  never matches values above Latin-1, and a complemented BitClass always
3485      *  matches values above Latin-1.
3486      */
3487     static final class BitClass implements BmpCharPredicate {
3488         final boolean[] bits;
3489         BitClass() {
3490             bits = new boolean[256];




3491         }
3492         BitClass add(int c, int flags) {
3493             assert c >= 0 && c <= 255;
3494             if ((flags & CASE_INSENSITIVE) != 0) {
3495                 if (ASCII.isAscii(c)) {
3496                     bits[ASCII.toUpper(c)] = true;
3497                     bits[ASCII.toLower(c)] = true;
3498                 } else if ((flags & UNICODE_CASE) != 0) {
3499                     bits[Character.toLowerCase(c)] = true;
3500                     bits[Character.toUpperCase(c)] = true;
3501                 }
3502             }
3503             bits[c] = true;
3504             return this;
3505         }
3506         public boolean is(int ch) {
3507             return ch < 256 && bits[ch];
3508         }
3509     }
3510 
3511 
3512     /**
3513      *  Utility method for creating a string slice matcher.
3514      */
3515     private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
3516         int[] tmp = new int[count];
3517         if (has(CASE_INSENSITIVE)) {
3518             if (has(UNICODE_CASE)) {
3519                 for (int i = 0; i < count; i++) {
3520                     tmp[i] = Character.toLowerCase(
3521                                  Character.toUpperCase(buf[i]));
3522                 }
3523                 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
3524             }
3525             for (int i = 0; i < count; i++) {
3526                 tmp[i] = ASCII.toLower(buf[i]);
3527             }
3528             return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
3529         }
3530         for (int i = 0; i < count; i++) {
3531             tmp[i] = buf[i];


3904                     }
3905                     return next.match(matcher, i, seq);
3906                 }
3907             } else {
3908                 matcher.hitEnd = true;
3909             }
3910             return false;
3911         }
3912         boolean study(TreeInfo info) {
3913             info.minLength++;
3914             info.maxLength += 2;
3915             return next.study(info);
3916         }
3917     }
3918 
3919     /**
3920      * Abstract node class to match one character satisfying some
3921      * boolean property.
3922      */
3923     static class CharProperty extends Node {
3924         final CharPredicate predicate;
3925 
3926         CharProperty (CharPredicate predicate) {
3927             this.predicate = predicate;
3928         }
3929         boolean match(Matcher matcher, int i, CharSequence seq) {
3930             if (i < matcher.to) {
3931                 int ch = Character.codePointAt(seq, i);
3932                 return predicate.is(ch) &&
3933                        next.match(matcher, i + Character.charCount(ch), seq);
3934             } else {
3935                 matcher.hitEnd = true;
3936                 return false;
3937             }
3938         }
3939         boolean study(TreeInfo info) {
3940             info.minLength++;
3941             info.maxLength++;
3942             return next.study(info);
3943         }
3944     }


3955             if (i < matcher.to) {
3956                 return predicate.is(seq.charAt(i)) &&
3957                        next.match(matcher, i + 1, seq);
3958             } else {
3959                 matcher.hitEnd = true;
3960                 return false;
3961             }
3962         }
3963     }
3964 
3965     private static class NFCCharProperty extends Node {
3966         CharPredicate predicate;
3967         NFCCharProperty (CharPredicate predicate) {
3968             this.predicate = predicate;
3969         }
3970 
3971         boolean match(Matcher matcher, int i, CharSequence seq) {
3972             if (i < matcher.to) {
3973                 int ch0 = Character.codePointAt(seq, i);
3974                 int n = Character.charCount(ch0);
3975                 int j = i + n;
3976                 // Fast check if it's necessary to call Normalizer;
3977                 // testing Grapheme.isBoundary is enough for this case
3978                 while (j < matcher.to) {
3979                     int ch1 = Character.codePointAt(seq, j);
3980                     if (Grapheme.isBoundary(ch0, ch1))
3981                         break;
3982                     ch0 = ch1;
3983                     j += Character.charCount(ch1);
3984                 }
3985                 if (i + n == j) {    // single, assume nfc cp
3986                     if (predicate.is(ch0))
3987                         return next.match(matcher, j, seq);
3988                 } else {
3989                     while (i + n < j) {
3990                         String nfc = Normalizer.normalize(
3991                             seq.toString().substring(i, j), Normalizer.Form.NFC);
3992                         if (nfc.codePointCount(0, nfc.length()) == 1) {
3993                             if (predicate.is(nfc.codePointAt(0)) &&
3994                                 next.match(matcher, j, seq)) {
3995                                 return true;
3996                             }
3997                         }
3998 
3999                         ch0 = Character.codePointBefore(seq, j);
4000                         j -= Character.charCount(ch0);
4001                     }
4002                 }
4003                 if (j < matcher.to)
4004                     return false;


4248                 return next.match(matcher, i, seq);
4249             default:
4250                 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
4251             }
4252         }
4253         boolean study(TreeInfo info) {
4254             if (type != Qtype.INDEPENDENT) {
4255                 int minL = info.minLength;
4256                 atom.study(info);
4257                 info.minLength = minL;
4258                 info.deterministic = false;
4259                 return next.study(info);
4260             } else {
4261                 atom.study(info);
4262                 return next.study(info);
4263             }
4264         }
4265     }
4266 
4267     /**
4268      * Handles the greedy style repetition with the specified minimum
4269      * and the maximum equal to MAX_REPS, for *, + and {N,} quantifiers.
4270      */
4271     static class CharPropertyGreedy extends Node {
4272         final CharPredicate predicate;
4273         final int cmin;
4274 
4275         CharPropertyGreedy(CharProperty cp, int cmin) {
4276             this.predicate = cp.predicate;
4277             this.cmin = cmin;
4278         }
4279         boolean match(Matcher matcher, int i, CharSequence seq) {
4280             int n = 0;
4281             int to = matcher.to;
4282             // greedy, all the way down
4283             while (i < to) {
4284                 int ch = Character.codePointAt(seq, i);
4285                 if (!predicate.is(ch))
4286                    break;
4287                 i += Character.charCount(ch);
4288                 n++;
4289             }
4290             if (i >= to) {
4291                 matcher.hitEnd = true;
4292             }
4293             while (n >= cmin) {
4294                 if (next.match(matcher, i, seq))
4295                     return true;
4296                 if (n == cmin)
4297                     return false;
4298                  // backing off if match fails
4299                 int ch = Character.codePointBefore(seq, i);


4302             }
4303             return false;
4304         }
4305 
4306         boolean study(TreeInfo info) {
4307             info.minLength += cmin;
4308             if (info.maxValid) {
4309                 info.maxLength += MAX_REPS;
4310             }
4311             info.deterministic = false;
4312             return next.study(info);
4313         }
4314     }
4315 
4316     static final class BmpCharPropertyGreedy extends CharPropertyGreedy {
4317 
4318         BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) {
4319             super(bcp, cmin);
4320         }
4321 
4322         boolean match(Matcher matcher, int i, CharSequence seq) {
4323             int n = 0;
4324             int to = matcher.to;
4325             while (i < to && predicate.is(seq.charAt(i))) {
4326                 i++; n++;
4327             }
4328             if (i >= to) {
4329                 matcher.hitEnd = true;
4330             }
4331             while (n >= cmin) {
4332                 if (next.match(matcher, i, seq))
4333                     return true;
4334                 i--; n--;  // backing off if match fails
4335             }
4336             return false;
4337         }
4338     }
4339 
4340     /**
4341      * Handles the curly-brace style repetition with a specified minimum and
4342      * maximum occurrences. The * quantifier is handled as a special case.


4679                 info.maxValid = false;
4680             }
4681 
4682             if (info.deterministic && cmin == cmax) {
4683                 info.deterministic = detm;
4684             } else {
4685                 info.deterministic = false;
4686             }
4687             return next.study(info);
4688         }
4689     }
4690 
4691     /**
4692      * A Guard node at the end of each atom node in a Branch. It
4693      * serves the purpose of chaining the "match" operation to
4694      * "next" but not the "study", so we can collect the TreeInfo
4695      * of each atom node without including the TreeInfo of the
4696      * "next".
4697      */
4698     static final class BranchConn extends Node {
4699         BranchConn() {}
4700         boolean match(Matcher matcher, int i, CharSequence seq) {
4701             return next.match(matcher, i, seq);
4702         }
4703         boolean study(TreeInfo info) {
4704             return info.deterministic;
4705         }
4706     }
4707 
4708     /**
4709      * Handles the branching of alternations. Note this is also used for
4710      * the ? quantifier to branch between the case where it matches once
4711      * and where it does not occur.
4712      */
4713     static final class Branch extends Node {
4714         Node[] atoms = new Node[2];
4715         int size = 2;
4716         Node conn;
4717         Branch(Node first, Node second, Node branchConn) {
4718             conn = branchConn;
4719             atoms[0] = first;


4776      * and restores them when the match is done.
4777      *
4778      * The matchRef is used when a reference to this group is accessed later
4779      * in the expression. The locals will have a negative value in them to
4780      * indicate that we do not want to unset the group if the reference
4781      * doesn't match.
4782      */
4783     static final class GroupHead extends Node {
4784         int localIndex;
4785         GroupTail tail;    // for debug/print only, match does not need to know
4786         GroupHead(int localCount) {
4787             localIndex = localCount;
4788         }
4789         boolean match(Matcher matcher, int i, CharSequence seq) {
4790             int save = matcher.locals[localIndex];
4791             matcher.locals[localIndex] = i;
4792             boolean ret = next.match(matcher, i, seq);
4793             matcher.locals[localIndex] = save;
4794             return ret;
4795         }




























4796     }
4797 
4798     /**
4799      * The GroupTail handles the setting of group beginning and ending
4800      * locations when groups are successfully matched. It must also be able to
4801      * unset groups that have to be backed off of.
4802      *
4803      * The GroupTail node is also used when a previous group is referenced,
4804      * and in that case no group information needs to be set.
4805      */
4806     static final class GroupTail extends Node {
4807         int localIndex;
4808         int groupIndex;
4809         GroupTail(int localCount, int groupCount) {
4810             localIndex = localCount;
4811             groupIndex = groupCount + groupCount;
4812         }
4813         boolean match(Matcher matcher, int i, CharSequence seq) {
4814             int tmp = matcher.locals[localIndex];
4815             if (tmp >= 0) { // This is the normal group case.


4897                         matcher.localsPos[posIndex].contains(i)) {
4898                         return next.match(matcher, i, seq);
4899                     }
4900                     matcher.locals[countIndex] = count + 1;
4901                     boolean b = body.match(matcher, i, seq);
4902                     // If match failed we must backtrack, so
4903                     // the loop count should NOT be incremented
4904                     if (b)
4905                         return true;
4906                     matcher.locals[countIndex] = count;
4907                     // save the failed position
4908                     if (posIndex != -1) {
4909                         matcher.localsPos[posIndex].add(i);
4910                     }
4911                 }
4912             }
4913             return next.match(matcher, i, seq);
4914         }
4915         boolean matchInit(Matcher matcher, int i, CharSequence seq) {
4916             int save = matcher.locals[countIndex];
4917             boolean ret;
4918             if (posIndex != -1 && matcher.localsPos[posIndex] == null) {
4919                 matcher.localsPos[posIndex] = new IntHashSet();
4920             }
4921             if (0 < cmin) {
4922                 matcher.locals[countIndex] = 1;
4923                 ret = body.match(matcher, i, seq);
4924             } else if (0 < cmax) {
4925                 matcher.locals[countIndex] = 1;
4926                 ret = body.match(matcher, i, seq);
4927                 if (ret == false)
4928                     ret = next.match(matcher, i, seq);
4929             } else {
4930                 ret = next.match(matcher, i, seq);
4931             }
4932             matcher.locals[countIndex] = save;
4933             return ret;
4934         }
4935         boolean study(TreeInfo info) {
4936             info.maxValid = false;
4937             info.deterministic = false;


5111             for (;;) {
5112                 if (i > matcher.to) {
5113                     matcher.hitEnd = true;
5114                     return false;
5115                 }
5116                 if (atom.match(matcher, i, seq)) {
5117                     return next.match(matcher, matcher.last, seq);
5118                 }
5119                 i += countChars(seq, i, 1);
5120                 matcher.first++;
5121             }
5122         }
5123         boolean study(TreeInfo info) {
5124             atom.study(info);
5125             info.maxValid = false;
5126             info.deterministic = false;
5127             return next.study(info);
5128         }
5129     }
5130 



































5131     /**
5132      * Zero width positive lookahead.
5133      */
5134     static final class Pos extends Node {
5135         Node cond;
5136         Pos(Node cond) {
5137             this.cond = cond;
5138         }
5139         boolean match(Matcher matcher, int i, CharSequence seq) {
5140             int savedTo = matcher.to;
5141             boolean conditionMatched;
5142 
5143             // Relax transparent region boundaries for lookahead
5144             if (matcher.transparentBounds)
5145                 matcher.to = matcher.getTextLength();
5146             try {
5147                 conditionMatched = cond.match(matcher, i, seq);
5148             } finally {
5149                 // Reinstate region boundaries
5150                 matcher.to = savedTo;
5151             }
5152             return conditionMatched && next.match(matcher, i, seq);
5153         }
5154     }
5155 
5156     /**
5157      * Zero width negative lookahead.
5158      */
5159     static final class Neg extends Node {
5160         Node cond;
5161         Neg(Node cond) {
5162             this.cond = cond;
5163         }
5164         boolean match(Matcher matcher, int i, CharSequence seq) {
5165             int savedTo = matcher.to;
5166             boolean conditionMatched;
5167 
5168             // Relax transparent region boundaries for lookahead
5169             if (matcher.transparentBounds)
5170                 matcher.to = matcher.getTextLength();
5171             try {
5172                 if (i < matcher.to) {
5173                     conditionMatched = !cond.match(matcher, i, seq);
5174                 } else {
5175                     // If a negative lookahead succeeds then more input
5176                     // could cause it to fail!
5177                     matcher.requireEnd = true;
5178                     conditionMatched = !cond.match(matcher, i, seq);
5179                 }
5180             } finally {
5181                 // Reinstate region boundaries
5182                 matcher.to = savedTo;
5183             }
5184             return conditionMatched && next.match(matcher, i, seq);
5185         }
5186     }
5187 
5188     /**
5189      * For use with lookbehinds; matches the position where the lookbehind
5190      * was encountered.
5191      */
5192     static class LookBehindEndNode extends Node {
5193         private LookBehindEndNode() {} // Singleton
5194 
5195         static LookBehindEndNode INSTANCE = new LookBehindEndNode();
5196 
5197         boolean match(Matcher matcher, int i, CharSequence seq) {
5198             return i == matcher.lookbehindTo;
5199         }
5200     }
5201 
5202     /**
5203      * Zero width positive lookbehind.
5204      */
5205     static class Behind extends Node {
5206         Node cond;
5207         int rmax, rmin;
5208         Behind(Node cond, int rmax, int rmin) {
5209             this.cond = cond;
5210             this.rmax = rmax;
5211             this.rmin = rmin;
5212         }
5213 
5214         boolean match(Matcher matcher, int i, CharSequence seq) {
5215             int savedFrom = matcher.from;
5216             boolean conditionMatched = false;
5217             int startIndex = (!matcher.transparentBounds) ?
5218                              matcher.from : 0;
5219             int from = Math.max(i - rmax, startIndex);
5220             // Set end boundary


5448          * Pre calculates arrays needed to generate the bad character
5449          * shift and the good suffix shift. Only the last seven bits
5450          * are used to see if chars match; This keeps the tables small
5451          * and covers the heavily used ASCII range, but occasionally
5452          * results in an aliased match for the bad character shift.
5453          */
5454         static Node optimize(Node node) {
5455             if (!(node instanceof Slice)) {
5456                 return node;
5457             }
5458 
5459             int[] src = ((Slice) node).buffer;
5460             int patternLength = src.length;
5461             // The BM algorithm requires a bit of overhead;
5462             // If the pattern is short don't use it, since
5463             // a shift larger than the pattern length cannot
5464             // be used anyway.
5465             if (patternLength < 4) {
5466                 return node;
5467             }
5468             int i, j;
5469             int[] lastOcc = new int[128];
5470             int[] optoSft = new int[patternLength];
5471             // Precalculate part of the bad character shift
5472             // It is a table for where in the pattern each
5473             // lower 7-bit value occurs
5474             for (i = 0; i < patternLength; i++) {
5475                 lastOcc[src[i]&0x7F] = i + 1;
5476             }
5477             // Precalculate the good suffix shift
5478             // i is the shift amount being considered
5479 NEXT:       for (i = patternLength; i > 0; i--) {
5480                 // j is the beginning index of suffix being considered
5481                 for (j = patternLength - 1; j >= i; j--) {
5482                     // Testing for good suffix
5483                     if (src[j] == src[j-i]) {
5484                         // src[j..len] is a good suffix
5485                         optoSft[j-1] = i;
5486                     } else {
5487                         // No match. The array has already been
5488                         // filled up with correct values before.


5603         boolean is(int ch);
5604 
5605         default CharPredicate and(CharPredicate p) {
5606             return ch -> is(ch) && p.is(ch);
5607         }
5608         default CharPredicate union(CharPredicate p) {
5609             return ch -> is(ch) || p.is(ch);
5610         }
5611         default CharPredicate union(CharPredicate p1,
5612                                     CharPredicate p2 ) {
5613             return ch -> is(ch) || p1.is(ch) || p2.is(ch);
5614         }
5615         default CharPredicate negate() {
5616             return ch -> !is(ch);
5617         }
5618     }
5619 
5620     static interface BmpCharPredicate extends CharPredicate {
5621 
5622         default CharPredicate and(CharPredicate p) {
5623             if (p instanceof BmpCharPredicate)
5624                 return (BmpCharPredicate)(ch -> is(ch) && p.is(ch));
5625             return ch -> is(ch) && p.is(ch);
5626         }
5627         default CharPredicate union(CharPredicate p) {
5628             if (p instanceof BmpCharPredicate)
5629                 return (BmpCharPredicate)(ch -> is(ch) || p.is(ch));
5630             return ch -> is(ch) || p.is(ch);
5631         }
5632         static CharPredicate union(CharPredicate... predicates) {
5633             CharPredicate cp = ch -> {
5634                 for (CharPredicate p : predicates) {
5635                     if (!p.is(ch))
5636                         return false;
5637                 }
5638                 return true;
5639             };
5640             for (CharPredicate p : predicates) {
5641                 if (! (p instanceof BmpCharPredicate))
5642                     return cp;
5643             }




2083                         // when put the "prev" into the branch as the first atom.
2084                         firstTail.next = branchConn;
2085                     }
2086                     prev = branch = new Branch(prev, node, branchConn);
2087                 }
2088             }
2089             if (peek() != '|') {
2090                 return prev;
2091             }
2092             next();
2093         }
2094     }
2095 
2096     @SuppressWarnings("fallthrough")
2097     /**
2098      * Parsing of sequences between alternations.
2099      */
2100     private Node sequence(Node end) {
2101         Node head = null;
2102         Node tail = null;
2103         Node node = null;
2104     LOOP:
2105         for (;;) {
2106             int ch = peek();
2107             switch (ch) {
2108             case '(':
2109                 // Because group handles its own closure,
2110                 // we need to treat it differently
2111                 node = group0();
2112                 // Check for comment or flag group
2113                 if (node == null)
2114                     continue;
2115                 if (head == null)
2116                     head = node;
2117                 else
2118                     tail.next = node;
2119                 // Double return: Tail was returned in root
2120                 tail = root;
2121                 continue;
2122             case '[':
2123                 if (has(CANON_EQ) && !has(LITERAL))


2600             if (inclass) break;
2601             if (create) root = new End();
2602             return -1;
2603         default:
2604             return ch;
2605         }
2606         throw error("Illegal/unsupported escape sequence");
2607     }
2608 
2609     /**
2610      * Parse a character class, and return the node that matches it.
2611      *
2612      * Consumes a ] on the way out if consume is true. Usually consume
2613      * is true except for the case of [abc&&def] where def is a separate
2614      * right hand node with "understood" brackets.
2615      */
2616     private CharPredicate clazz(boolean consume) {
2617         CharPredicate prev = null;
2618         CharPredicate curr = null;
2619         BitClass bits = new BitClass();
2620         BmpCharPredicate bitsP = ch -> ch < 256 && bits.bits[ch];
2621 
2622         boolean isNeg = false;
2623         boolean hasBits = false;
2624         int ch = next();
2625 
2626         // Negates if first char in a class, otherwise literal
2627         if (ch == '^' && temp[cursor-1] == '[') {
2628             ch = next();
2629             isNeg = true;
2630         }
2631         for (;;) {
2632             switch (ch) {
2633                 case '[':
2634                     curr = clazz(true);
2635                     if (prev == null)
2636                         prev = curr;
2637                     else
2638                         prev = prev.union(curr);
2639                     ch = peek();
2640                     continue;
2641                 case '&':
2642                     ch = next();
2643                     if (ch == '&') {
2644                         ch = next();
2645                         CharPredicate right = null;
2646                         while (ch != ']' && ch != '&') {
2647                             if (ch == '[') {
2648                                 if (right == null)
2649                                     right = clazz(true);
2650                                 else
2651                                     right = right.union(clazz(true));
2652                             } else { // abc&&def
2653                                 unread();
2654                                 right = clazz(false);
2655                             }
2656                             ch = peek();
2657                         }
2658                         if (hasBits) {
2659                             // bits used, union has high precedence
2660                             if (prev == null) {
2661                                 prev = curr = bitsP;
2662                             } else {
2663                                 prev = prev.union(bitsP);
2664                             }
2665                             hasBits = false;
2666                         }
2667                         if (right != null)
2668                             curr = right;
2669                         if (prev == null) {
2670                             if (right == null)
2671                                 throw error("Bad class syntax");
2672                             else
2673                                 prev = right;
2674                         } else {
2675                             prev = prev.and(curr);
2676                         }
2677                     } else {
2678                         // treat as a literal &
2679                         unread();
2680                         break;
2681                     }
2682                     continue;
2683                 case 0:
2684                     if (cursor >= patternLength)
2685                         throw error("Unclosed character class");
2686                     break;
2687                 case ']':
2688                     if (prev != null || hasBits) {
2689                         if (consume)
2690                             next();
2691                         if (prev == null)
2692                             prev = bitsP;
2693                         else if (hasBits)
2694                             prev = prev.union(bitsP);
2695                         if (isNeg)
2696                             return prev.negate();
2697                         return prev;
2698                     }
2699                     break;
2700                 default:
2701                     break;
2702             }
2703             curr = range(bits);
2704             if (curr == null) {    // the bits used
2705                 hasBits = true;
2706             } else {
2707                 if (prev == null)
2708                     prev = curr;
2709                 else if (prev != curr)
2710                     prev = prev.union(curr);
2711             }
2712             ch = peek();
2713         }
2714     }


2930      */
2931     private String groupname(int ch) {
2932         StringBuilder sb = new StringBuilder();
2933         if (!ASCII.isAlpha(ch))
2934             throw error("capturing group name does not start with a Latin letter");
2935         do {
2936             sb.append((char) ch);
2937         } while (ASCII.isAlnum(ch=read()));
2938         if (ch != '>')
2939             throw error("named capturing group is missing trailing '>'");
2940         return sb.toString();
2941     }
2942 
2943     /**
2944      * Parses a group and returns the head node of a set of nodes that process
2945      * the group. Sometimes a double return system is used where the tail is
2946      * returned in root.
2947      */
2948     private Node group0() {
2949         boolean capturingGroup = false;
2950         Node head = null;
2951         Node tail = null;
2952         int save = flags0;
2953         int saveTCNCount = topClosureNodes.size();
2954         root = null;
2955         int ch = next();
2956         if (ch == '?') {
2957             ch = skip();
2958             switch (ch) {
2959             case ':':   //  (?:xxx) pure group
2960                 head = createGroup(true);
2961                 tail = root;
2962                 head.next = expr(tail);
2963                 break;
2964             case '=':   // (?=xxx) and (?!xxx) lookahead
2965             case '!':
2966                 head = createGroup(true);
2967                 tail = root;
2968                 head.next = expr(tail);
2969                 if (ch == '=') {
2970                     head = tail = new Pos(head);
2971                 } else {


2980                 break;
2981             case '<':   // (?<xxx)  look behind
2982                 ch = read();
2983                 if (ch != '=' && ch != '!') {
2984                     // named captured group
2985                     String name = groupname(ch);
2986                     if (namedGroups().containsKey(name))
2987                         throw error("Named capturing group <" + name
2988                                     + "> is already defined");
2989                     capturingGroup = true;
2990                     head = createGroup(false);
2991                     tail = root;
2992                     namedGroups().put(name, capturingGroupCount-1);
2993                     head.next = expr(tail);
2994                     break;
2995                 }
2996                 int start = cursor;
2997                 head = createGroup(true);
2998                 tail = root;
2999                 head.next = expr(tail);
3000                 tail.next = lookbehindEnd;
3001                 TreeInfo info = new TreeInfo();
3002                 head.study(info);
3003                 if (info.maxValid == false) {
3004                     throw error("Look-behind group does not have "
3005                                 + "an obvious maximum length");
3006                 }
3007                 boolean hasSupplementary = findSupplementary(start, patternLength);
3008                 if (ch == '=') {
3009                     head = tail = (hasSupplementary ?
3010                                    new BehindS(head, info.maxLength,
3011                                                info.minLength) :
3012                                    new Behind(head, info.maxLength,
3013                                               info.minLength));
3014                 } else { // if (ch == '!')
3015                     head = tail = (hasSupplementary ?
3016                                    new NotBehindS(head, info.maxLength,
3017                                                   info.minLength) :
3018                                    new NotBehind(head, info.maxLength,
3019                                                  info.minLength));
3020                 }


3236             next();
3237             return new Curly(prev, cmin, MAX_REPS, Qtype.LAZY);
3238         } else if (ch == '+') {
3239             next();
3240             return new Curly(prev, cmin, MAX_REPS, Qtype.POSSESSIVE);
3241         }
3242         if (prev instanceof BmpCharProperty) {
3243             return new BmpCharPropertyGreedy((BmpCharProperty)prev, cmin);
3244         } else if (prev instanceof CharProperty) {
3245             return new CharPropertyGreedy((CharProperty)prev, cmin);
3246         }
3247         return new Curly(prev, cmin, MAX_REPS, Qtype.GREEDY);
3248     }
3249 
3250     /**
3251      * Processes repetition. If the next character peeked is a quantifier
3252      * then new nodes must be appended to handle the repetition.
3253      * Prev could be a single or a group, so it could be a chain of nodes.
3254      */
3255     private Node closure(Node prev) {
3256         Node atom;
3257         int ch = peek();
3258         switch (ch) {
3259         case '?':
3260             ch = next();
3261             if (ch == '?') {
3262                 next();
3263                 return new Ques(prev, Qtype.LAZY);
3264             } else if (ch == '+') {
3265                 next();
3266                 return new Ques(prev, Qtype.POSSESSIVE);
3267             }
3268             return new Ques(prev, Qtype.GREEDY);
3269         case '*':
3270             return curly(prev, 0);
3271         case '+':
3272             return curly(prev, 1);
3273         case '{':
3274             ch = skip();
3275             if (ASCII.isDigit(ch)) {
3276                 int cmin = 0, cmax;
3277                 try {
3278                     do {
3279                         cmin = Math.addExact(Math.multiplyExact(cmin, 10),
3280                                              ch - '0');
3281                     } while (ASCII.isDigit(ch = read()));
3282                     cmax = cmin;
3283                     if (ch == ',') {
3284                         ch = read();
3285                         cmax = MAX_REPS;
3286                         if (ch != '}') {


3287                             cmax = 0;
3288                             while (ASCII.isDigit(ch)) {
3289                                 cmax = Math.addExact(Math.multiplyExact(cmax, 10),
3290                                                      ch - '0');
3291                                 ch = read();
3292                             }
3293                         }


3294                     }
3295                 } catch (ArithmeticException ae) {
3296                     throw error("Illegal repetition range");
3297                 }
3298                 if (ch != '}')
3299                     throw error("Unclosed counted closure");
3300                 if (cmax < cmin)
3301                     throw error("Illegal repetition range");
3302                 Curly curly;
3303                 ch = peek();
3304                 if (ch == '?') {
3305                     next();
3306                     curly = new Curly(prev, cmin, cmax, Qtype.LAZY);
3307                 } else if (ch == '+') {
3308                     next();
3309                     curly = new Curly(prev, cmin, cmax, Qtype.POSSESSIVE);
3310                 } else {
3311                     curly = new Curly(prev, cmin, cmax, Qtype.GREEDY);
3312                 }
3313                 return curly;
3314             } else {
3315                 throw error("Illegal repetition");
3316             }
3317         default:
3318             return prev;
3319         }
3320     }
3321 
3322     /**
3323      *  Utility method for parsing control escape sequences.
3324      */
3325     private int c() {
3326         if (cursor < patternLength) {
3327             return read() ^ 64;
3328         }
3329         throw error("Illegal control escape sequence");
3330     }
3331 
3332     /**
3333      *  Utility method for parsing octal escape sequences.


3468 
3469     private static final int countCodePoints(CharSequence seq) {
3470         int length = seq.length();
3471         int n = 0;
3472         for (int i = 0; i < length; ) {
3473             n++;
3474             if (Character.isHighSurrogate(seq.charAt(i++))) {
3475                 if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
3476                     i++;
3477                 }
3478             }
3479         }
3480         return n;
3481     }
3482 
3483     /**
3484      *  Creates a bit vector for matching Latin-1 values. A normal BitClass
3485      *  never matches values above Latin-1, and a complemented BitClass always
3486      *  matches values above Latin-1.
3487      */
3488     static final class BitClass extends BmpCharProperty {
3489         final boolean[] bits;
3490         BitClass() {
3491             this(new boolean[256]);
3492         }
3493         private BitClass(boolean[] bits) {
3494             super( ch -> ch < 256 && bits[ch]);
3495             this.bits = bits;
3496         }
3497         BitClass add(int c, int flags) {
3498             assert c >= 0 && c <= 255;
3499             if ((flags & CASE_INSENSITIVE) != 0) {
3500                 if (ASCII.isAscii(c)) {
3501                     bits[ASCII.toUpper(c)] = true;
3502                     bits[ASCII.toLower(c)] = true;
3503                 } else if ((flags & UNICODE_CASE) != 0) {
3504                     bits[Character.toLowerCase(c)] = true;
3505                     bits[Character.toUpperCase(c)] = true;
3506                 }
3507             }
3508             bits[c] = true;
3509             return this;
3510         }



3511     }
3512 

3513     /**
3514      *  Utility method for creating a string slice matcher.
3515      */
3516     private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
3517         int[] tmp = new int[count];
3518         if (has(CASE_INSENSITIVE)) {
3519             if (has(UNICODE_CASE)) {
3520                 for (int i = 0; i < count; i++) {
3521                     tmp[i] = Character.toLowerCase(
3522                                  Character.toUpperCase(buf[i]));
3523                 }
3524                 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
3525             }
3526             for (int i = 0; i < count; i++) {
3527                 tmp[i] = ASCII.toLower(buf[i]);
3528             }
3529             return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
3530         }
3531         for (int i = 0; i < count; i++) {
3532             tmp[i] = buf[i];


3905                     }
3906                     return next.match(matcher, i, seq);
3907                 }
3908             } else {
3909                 matcher.hitEnd = true;
3910             }
3911             return false;
3912         }
3913         boolean study(TreeInfo info) {
3914             info.minLength++;
3915             info.maxLength += 2;
3916             return next.study(info);
3917         }
3918     }
3919 
3920     /**
3921      * Abstract node class to match one character satisfying some
3922      * boolean property.
3923      */
3924     static class CharProperty extends Node {
3925         CharPredicate predicate;
3926 
3927         CharProperty (CharPredicate predicate) {
3928             this.predicate = predicate;
3929         }
3930         boolean match(Matcher matcher, int i, CharSequence seq) {
3931             if (i < matcher.to) {
3932                 int ch = Character.codePointAt(seq, i);
3933                 return predicate.is(ch) &&
3934                        next.match(matcher, i + Character.charCount(ch), seq);
3935             } else {
3936                 matcher.hitEnd = true;
3937                 return false;
3938             }
3939         }
3940         boolean study(TreeInfo info) {
3941             info.minLength++;
3942             info.maxLength++;
3943             return next.study(info);
3944         }
3945     }


3956             if (i < matcher.to) {
3957                 return predicate.is(seq.charAt(i)) &&
3958                        next.match(matcher, i + 1, seq);
3959             } else {
3960                 matcher.hitEnd = true;
3961                 return false;
3962             }
3963         }
3964     }
3965 
3966     private static class NFCCharProperty extends Node {
3967         CharPredicate predicate;
3968         NFCCharProperty (CharPredicate predicate) {
3969             this.predicate = predicate;
3970         }
3971 
3972         boolean match(Matcher matcher, int i, CharSequence seq) {
3973             if (i < matcher.to) {
3974                 int ch0 = Character.codePointAt(seq, i);
3975                 int n = Character.charCount(ch0);
3976                 int j = Grapheme.nextBoundary(seq, i, matcher.to);









3977                 if (i + n == j) {    // single, assume nfc cp
3978                     if (predicate.is(ch0))
3979                         return next.match(matcher, j, seq);
3980                 } else {
3981                     while (i + n < j) {
3982                         String nfc = Normalizer.normalize(
3983                             seq.toString().substring(i, j), Normalizer.Form.NFC);
3984                         if (nfc.codePointCount(0, nfc.length()) == 1) {
3985                             if (predicate.is(nfc.codePointAt(0)) &&
3986                                 next.match(matcher, j, seq)) {
3987                                 return true;
3988                             }
3989                         }
3990 
3991                         ch0 = Character.codePointBefore(seq, j);
3992                         j -= Character.charCount(ch0);
3993                     }
3994                 }
3995                 if (j < matcher.to)
3996                     return false;


4240                 return next.match(matcher, i, seq);
4241             default:
4242                 return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
4243             }
4244         }
4245         boolean study(TreeInfo info) {
4246             if (type != Qtype.INDEPENDENT) {
4247                 int minL = info.minLength;
4248                 atom.study(info);
4249                 info.minLength = minL;
4250                 info.deterministic = false;
4251                 return next.study(info);
4252             } else {
4253                 atom.study(info);
4254                 return next.study(info);
4255             }
4256         }
4257     }
4258 
4259     /**
4260      * Handles the greedy style repetition with the minimum either be
4261      * 0 or 1 and the maximum be MAX_REPS, for * and + quantifier.
4262      */
4263     static class CharPropertyGreedy extends Node {
4264         final CharPredicate predicate;
4265         final int cmin;
4266 
4267         CharPropertyGreedy(CharProperty cp, int cmin) {
4268             this.predicate = cp.predicate;
4269             this.cmin = cmin;
4270         }
4271         boolean match(Matcher matcher, int i,  CharSequence seq) {
4272             int n = 0;
4273             int to = matcher.to;
4274             // greedy, all the way down
4275             while (i < to) {
4276                 int ch = Character.codePointAt(seq, i);
4277                 if (!predicate.is(ch))
4278                    break;
4279                 i += Character.charCount(ch);
4280                 n++;
4281             }
4282             if (i >= to) {
4283                 matcher.hitEnd = true;
4284             }
4285             while (n >= cmin) {
4286                 if (next.match(matcher, i, seq))
4287                     return true;
4288                 if (n == cmin)
4289                     return false;
4290                  // backing off if match fails
4291                 int ch = Character.codePointBefore(seq, i);


4294             }
4295             return false;
4296         }
4297 
4298         boolean study(TreeInfo info) {
4299             info.minLength += cmin;
4300             if (info.maxValid) {
4301                 info.maxLength += MAX_REPS;
4302             }
4303             info.deterministic = false;
4304             return next.study(info);
4305         }
4306     }
4307 
4308     static final class BmpCharPropertyGreedy extends CharPropertyGreedy {
4309 
4310         BmpCharPropertyGreedy(BmpCharProperty bcp, int cmin) {
4311             super(bcp, cmin);
4312         }
4313 
4314         boolean match(Matcher matcher, int i,  CharSequence seq) {
4315             int n = 0;
4316             int to = matcher.to;
4317             while (i < to && predicate.is(seq.charAt(i))) {
4318                 i++; n++;
4319             }
4320             if (i >= to) {
4321                 matcher.hitEnd = true;
4322             }
4323             while (n >= cmin) {
4324                 if (next.match(matcher, i, seq))
4325                     return true;
4326                 i--; n--;  // backing off if match fails
4327             }
4328             return false;
4329         }
4330     }
4331 
4332     /**
4333      * Handles the curly-brace style repetition with a specified minimum and
4334      * maximum occurrences. The * quantifier is handled as a special case.


4671                 info.maxValid = false;
4672             }
4673 
4674             if (info.deterministic && cmin == cmax) {
4675                 info.deterministic = detm;
4676             } else {
4677                 info.deterministic = false;
4678             }
4679             return next.study(info);
4680         }
4681     }
4682 
4683     /**
4684      * A Guard node at the end of each atom node in a Branch. It
4685      * serves the purpose of chaining the "match" operation to
4686      * "next" but not the "study", so we can collect the TreeInfo
4687      * of each atom node without including the TreeInfo of the
4688      * "next".
4689      */
4690     static final class BranchConn extends Node {
4691         BranchConn() {};
4692         boolean match(Matcher matcher, int i, CharSequence seq) {
4693             return next.match(matcher, i, seq);
4694         }
4695         boolean study(TreeInfo info) {
4696             return info.deterministic;
4697         }
4698     }
4699 
4700     /**
4701      * Handles the branching of alternations. Note this is also used for
4702      * the ? quantifier to branch between the case where it matches once
4703      * and where it does not occur.
4704      */
4705     static final class Branch extends Node {
4706         Node[] atoms = new Node[2];
4707         int size = 2;
4708         Node conn;
4709         Branch(Node first, Node second, Node branchConn) {
4710             conn = branchConn;
4711             atoms[0] = first;


4768      * and restores them when the match is done.
4769      *
4770      * The matchRef is used when a reference to this group is accessed later
4771      * in the expression. The locals will have a negative value in them to
4772      * indicate that we do not want to unset the group if the reference
4773      * doesn't match.
4774      */
4775     static final class GroupHead extends Node {
4776         int localIndex;
4777         GroupTail tail;    // for debug/print only, match does not need to know
4778         GroupHead(int localCount) {
4779             localIndex = localCount;
4780         }
4781         boolean match(Matcher matcher, int i, CharSequence seq) {
4782             int save = matcher.locals[localIndex];
4783             matcher.locals[localIndex] = i;
4784             boolean ret = next.match(matcher, i, seq);
4785             matcher.locals[localIndex] = save;
4786             return ret;
4787         }
4788         boolean matchRef(Matcher matcher, int i, CharSequence seq) {
4789             int save = matcher.locals[localIndex];
4790             matcher.locals[localIndex] = ~i; // HACK
4791             boolean ret = next.match(matcher, i, seq);
4792             matcher.locals[localIndex] = save;
4793             return ret;
4794         }
4795     }
4796 
4797     /**
4798      * Recursive reference to a group in the regular expression. It calls
4799      * matchRef because if the reference fails to match we would not unset
4800      * the group.
4801      */
4802     static final class GroupRef extends Node {
4803         GroupHead head;
4804         GroupRef(GroupHead head) {
4805             this.head = head;
4806         }
4807         boolean match(Matcher matcher, int i, CharSequence seq) {
4808             return head.matchRef(matcher, i, seq)
4809                 && next.match(matcher, matcher.last, seq);
4810         }
4811         boolean study(TreeInfo info) {
4812             info.maxValid = false;
4813             info.deterministic = false;
4814             return next.study(info);
4815         }
4816     }
4817 
4818     /**
4819      * The GroupTail handles the setting of group beginning and ending
4820      * locations when groups are successfully matched. It must also be able to
4821      * unset groups that have to be backed off of.
4822      *
4823      * The GroupTail node is also used when a previous group is referenced,
4824      * and in that case no group information needs to be set.
4825      */
4826     static final class GroupTail extends Node {
4827         int localIndex;
4828         int groupIndex;
4829         GroupTail(int localCount, int groupCount) {
4830             localIndex = localCount;
4831             groupIndex = groupCount + groupCount;
4832         }
4833         boolean match(Matcher matcher, int i, CharSequence seq) {
4834             int tmp = matcher.locals[localIndex];
4835             if (tmp >= 0) { // This is the normal group case.


4917                         matcher.localsPos[posIndex].contains(i)) {
4918                         return next.match(matcher, i, seq);
4919                     }
4920                     matcher.locals[countIndex] = count + 1;
4921                     boolean b = body.match(matcher, i, seq);
4922                     // If match failed we must backtrack, so
4923                     // the loop count should NOT be incremented
4924                     if (b)
4925                         return true;
4926                     matcher.locals[countIndex] = count;
4927                     // save the failed position
4928                     if (posIndex != -1) {
4929                         matcher.localsPos[posIndex].add(i);
4930                     }
4931                 }
4932             }
4933             return next.match(matcher, i, seq);
4934         }
4935         boolean matchInit(Matcher matcher, int i, CharSequence seq) {
4936             int save = matcher.locals[countIndex];
4937             boolean ret = false;
4938             if (posIndex != -1 && matcher.localsPos[posIndex] == null) {
4939                 matcher.localsPos[posIndex] = new IntHashSet();
4940             }
4941             if (0 < cmin) {
4942                 matcher.locals[countIndex] = 1;
4943                 ret = body.match(matcher, i, seq);
4944             } else if (0 < cmax) {
4945                 matcher.locals[countIndex] = 1;
4946                 ret = body.match(matcher, i, seq);
4947                 if (ret == false)
4948                     ret = next.match(matcher, i, seq);
4949             } else {
4950                 ret = next.match(matcher, i, seq);
4951             }
4952             matcher.locals[countIndex] = save;
4953             return ret;
4954         }
4955         boolean study(TreeInfo info) {
4956             info.maxValid = false;
4957             info.deterministic = false;


5131             for (;;) {
5132                 if (i > matcher.to) {
5133                     matcher.hitEnd = true;
5134                     return false;
5135                 }
5136                 if (atom.match(matcher, i, seq)) {
5137                     return next.match(matcher, matcher.last, seq);
5138                 }
5139                 i += countChars(seq, i, 1);
5140                 matcher.first++;
5141             }
5142         }
5143         boolean study(TreeInfo info) {
5144             atom.study(info);
5145             info.maxValid = false;
5146             info.deterministic = false;
5147             return next.study(info);
5148         }
5149     }
5150 
5151     static final class Conditional extends Node {
5152         Node cond, yes, not;
5153         Conditional(Node cond, Node yes, Node not) {
5154             this.cond = cond;
5155             this.yes = yes;
5156             this.not = not;
5157         }
5158         boolean match(Matcher matcher, int i, CharSequence seq) {
5159             if (cond.match(matcher, i, seq)) {
5160                 return yes.match(matcher, i, seq);
5161             } else {
5162                 return not.match(matcher, i, seq);
5163             }
5164         }
5165         boolean study(TreeInfo info) {
5166             int minL = info.minLength;
5167             int maxL = info.maxLength;
5168             boolean maxV = info.maxValid;
5169             info.reset();
5170             yes.study(info);
5171 
5172             int minL2 = info.minLength;
5173             int maxL2 = info.maxLength;
5174             boolean maxV2 = info.maxValid;
5175             info.reset();
5176             not.study(info);
5177 
5178             info.minLength = minL + Math.min(minL2, info.minLength);
5179             info.maxLength = maxL + Math.max(maxL2, info.maxLength);
5180             info.maxValid = (maxV & maxV2 & info.maxValid);
5181             info.deterministic = false;
5182             return next.study(info);
5183         }
5184     }
5185 
5186     /**
5187      * Zero width positive lookahead.
5188      */
5189     static final class Pos extends Node {
5190         Node cond;
5191         Pos(Node cond) {
5192             this.cond = cond;
5193         }
5194         boolean match(Matcher matcher, int i, CharSequence seq) {
5195             int savedTo = matcher.to;
5196             boolean conditionMatched = false;
5197 
5198             // Relax transparent region boundaries for lookahead
5199             if (matcher.transparentBounds)
5200                 matcher.to = matcher.getTextLength();
5201             try {
5202                 conditionMatched = cond.match(matcher, i, seq);
5203             } finally {
5204                 // Reinstate region boundaries
5205                 matcher.to = savedTo;
5206             }
5207             return conditionMatched && next.match(matcher, i, seq);
5208         }
5209     }
5210 
5211     /**
5212      * Zero width negative lookahead.
5213      */
5214     static final class Neg extends Node {
5215         Node cond;
5216         Neg(Node cond) {
5217             this.cond = cond;
5218         }
5219         boolean match(Matcher matcher, int i, CharSequence seq) {
5220             int savedTo = matcher.to;
5221             boolean conditionMatched = false;
5222 
5223             // Relax transparent region boundaries for lookahead
5224             if (matcher.transparentBounds)
5225                 matcher.to = matcher.getTextLength();
5226             try {
5227                 if (i < matcher.to) {
5228                     conditionMatched = !cond.match(matcher, i, seq);
5229                 } else {
5230                     // If a negative lookahead succeeds then more input
5231                     // could cause it to fail!
5232                     matcher.requireEnd = true;
5233                     conditionMatched = !cond.match(matcher, i, seq);
5234                 }
5235             } finally {
5236                 // Reinstate region boundaries
5237                 matcher.to = savedTo;
5238             }
5239             return conditionMatched && next.match(matcher, i, seq);
5240         }
5241     }
5242 
5243     /**
5244      * For use with lookbehinds; matches the position where the lookbehind
5245      * was encountered.
5246      */
5247     static Node lookbehindEnd = new Node() {




5248         boolean match(Matcher matcher, int i, CharSequence seq) {
5249             return i == matcher.lookbehindTo;
5250         }
5251     };
5252 
5253     /**
5254      * Zero width positive lookbehind.
5255      */
5256     static class Behind extends Node {
5257         Node cond;
5258         int rmax, rmin;
5259         Behind(Node cond, int rmax, int rmin) {
5260             this.cond = cond;
5261             this.rmax = rmax;
5262             this.rmin = rmin;
5263         }
5264 
5265         boolean match(Matcher matcher, int i, CharSequence seq) {
5266             int savedFrom = matcher.from;
5267             boolean conditionMatched = false;
5268             int startIndex = (!matcher.transparentBounds) ?
5269                              matcher.from : 0;
5270             int from = Math.max(i - rmax, startIndex);
5271             // Set end boundary


5499          * Pre calculates arrays needed to generate the bad character
5500          * shift and the good suffix shift. Only the last seven bits
5501          * are used to see if chars match; This keeps the tables small
5502          * and covers the heavily used ASCII range, but occasionally
5503          * results in an aliased match for the bad character shift.
5504          */
5505         static Node optimize(Node node) {
5506             if (!(node instanceof Slice)) {
5507                 return node;
5508             }
5509 
5510             int[] src = ((Slice) node).buffer;
5511             int patternLength = src.length;
5512             // The BM algorithm requires a bit of overhead;
5513             // If the pattern is short don't use it, since
5514             // a shift larger than the pattern length cannot
5515             // be used anyway.
5516             if (patternLength < 4) {
5517                 return node;
5518             }
5519             int i, j, k;
5520             int[] lastOcc = new int[128];
5521             int[] optoSft = new int[patternLength];
5522             // Precalculate part of the bad character shift
5523             // It is a table for where in the pattern each
5524             // lower 7-bit value occurs
5525             for (i = 0; i < patternLength; i++) {
5526                 lastOcc[src[i]&0x7F] = i + 1;
5527             }
5528             // Precalculate the good suffix shift
5529             // i is the shift amount being considered
5530 NEXT:       for (i = patternLength; i > 0; i--) {
5531                 // j is the beginning index of suffix being considered
5532                 for (j = patternLength - 1; j >= i; j--) {
5533                     // Testing for good suffix
5534                     if (src[j] == src[j-i]) {
5535                         // src[j..len] is a good suffix
5536                         optoSft[j-1] = i;
5537                     } else {
5538                         // No match. The array has already been
5539                         // filled up with correct values before.


5654         boolean is(int ch);
5655 
5656         default CharPredicate and(CharPredicate p) {
5657             return ch -> is(ch) && p.is(ch);
5658         }
5659         default CharPredicate union(CharPredicate p) {
5660             return ch -> is(ch) || p.is(ch);
5661         }
5662         default CharPredicate union(CharPredicate p1,
5663                                     CharPredicate p2 ) {
5664             return ch -> is(ch) || p1.is(ch) || p2.is(ch);
5665         }
5666         default CharPredicate negate() {
5667             return ch -> !is(ch);
5668         }
5669     }
5670 
5671     static interface BmpCharPredicate extends CharPredicate {
5672 
5673         default CharPredicate and(CharPredicate p) {
5674             if(p instanceof BmpCharPredicate)
5675                 return (BmpCharPredicate)(ch -> is(ch) && p.is(ch));
5676             return ch -> is(ch) && p.is(ch);
5677         }
5678         default CharPredicate union(CharPredicate p) {
5679             if (p instanceof BmpCharPredicate)
5680                 return (BmpCharPredicate)(ch -> is(ch) || p.is(ch));
5681             return ch -> is(ch) || p.is(ch);
5682         }
5683         static CharPredicate union(CharPredicate... predicates) {
5684             CharPredicate cp = ch -> {
5685                 for (CharPredicate p : predicates) {
5686                     if (!p.is(ch))
5687                         return false;
5688                 }
5689                 return true;
5690             };
5691             for (CharPredicate p : predicates) {
5692                 if (! (p instanceof BmpCharPredicate))
5693                     return cp;
5694             }


< prev index next >