Source for gnu.java.util.regex.REMatch

   1: /* gnu/regexp/REMatch.java
   2:    Copyright (C) 2006 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.java.util.regex;
  40: import java.io.Serializable;
  41: 
  42: /**
  43:  * An instance of this class represents a match
  44:  * completed by a gnu.regexp matching function. It can be used
  45:  * to obtain relevant information about the location of a match
  46:  * or submatch.
  47:  *
  48:  * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
  49:  */
  50: public final class REMatch implements Serializable, Cloneable {
  51:     private String matchedText;
  52:     private CharIndexed matchedCharIndexed;
  53: 
  54:     // These variables are package scope for fast access within the engine
  55:     int eflags; // execution flags this match was made using
  56: 
  57:     // Offset in source text where match was tried.  This is zero-based;
  58:     // the actual position in the source text is given by (offset + anchor).
  59:     int offset;
  60: 
  61:     // Anchor position refers to the index into the source input
  62:     // at which the matching operation began.
  63:     // This is also useful for the ANCHORINDEX option.
  64:     int anchor;
  65: 
  66:     // Package scope; used by RE.
  67:     int index; // used while matching to mark current match position in input
  68:     // start1[i] is set when the i-th subexp starts. And start1[i] is copied
  69:     // to start[i] when the i-th subexp ends.  So start[i] keeps the previously
  70:     // assigned value while the i-th subexp is being processed. This makes
  71:     // backreference to the i-th subexp within the i-th subexp possible.
  72:     int[] start; // start positions (relative to offset) for each (sub)exp.
  73:     int[] start1; // start positions (relative to offset) for each (sub)exp.
  74:     int[] end;   // end positions for the same
  75:     // start[i] == -1 or end[i] == -1 means that the start/end position is void.
  76:     // start[i] == p or end[i] == p where p < 0 and p != -1 means that
  77:     // the actual start/end position is (p+1). Start/end positions may
  78:     // become negative when the subexpression is in a RETokenLookBehind.
  79:     boolean empty; // empty string matched. This flag is used only within
  80:            // RETokenRepeated.
  81: 
  82:     BacktrackStack backtrackStack;
  83: 
  84:     public Object clone() {
  85:     try {
  86:         REMatch copy = (REMatch) super.clone();
  87: 
  88:         copy.start = (int[]) start.clone();
  89:         copy.start1 = (int[]) start1.clone();
  90:         copy.end = (int[]) end.clone();
  91: 
  92:         return copy;
  93:     } catch (CloneNotSupportedException e) {
  94:         throw new Error(); // doesn't happen
  95:     }
  96:     }
  97: 
  98:     void assignFrom(REMatch other) {
  99:     start = other.start;
 100:     start1 = other.start1;
 101:     end = other.end;
 102:     index = other.index;
 103:     backtrackStack = other.backtrackStack;
 104:     }
 105: 
 106:     REMatch(int subs, int anchor, int eflags) {
 107:     start = new int[subs+1];
 108:     start1 = new int[subs+1];
 109:     end = new int[subs+1];
 110:     this.anchor = anchor;
 111:     this.eflags = eflags;
 112:     clear(anchor);
 113:     }
 114: 
 115:     void finish(CharIndexed text) {
 116:     start[0] = 0;
 117:     StringBuffer sb = new StringBuffer();
 118:     int i;
 119:     for (i = 0; i < end[0]; i++)
 120:         sb.append(text.charAt(i));
 121:     matchedText = sb.toString();
 122:     matchedCharIndexed = text;
 123:     for (i = 0; i < start.length; i++) {
 124:         // If any subexpressions didn't terminate, they don't count
 125:         // TODO check if this code ever gets hit
 126:         if ((start[i] == -1) ^ (end[i] == -1)) {
 127:         start[i] = -1;
 128:         end[i] = -1;
 129:         }
 130:     }
 131:     backtrackStack = null;
 132:     }
 133:     
 134:     /** Clears the current match and moves the offset to the new index. */
 135:     void clear(int index) {
 136:     offset = index;
 137:     this.index = 0;
 138:     for (int i = 0; i < start.length; i++) {
 139:         start[i] = start1[i] = end[i] = -1;
 140:     }
 141:     backtrackStack = null;
 142:     }
 143:     
 144:     /**
 145:      * Returns the string matching the pattern.  This makes it convenient
 146:      * to write code like the following:
 147:      * <P>
 148:      * <code> 
 149:      * REMatch myMatch = myExpression.getMatch(myString);<br>
 150:      * if (myMatch != null) System.out.println("Regexp found: "+myMatch);
 151:      * </code>
 152:      */
 153:     public String toString() {
 154:     return matchedText;
 155:     }
 156:     
 157:     /**
 158:      * Returns the index within the input text where the match in its entirety
 159:      * began.
 160:      */
 161:     public int getStartIndex() {
 162:     return offset + start[0];
 163:     }
 164:     
 165:     /**
 166:      * Returns the index within the input string where the match in
 167:      * its entirety ends.  The return value is the next position after
 168:      * the end of the string; therefore, a match created by the
 169:      * following call:
 170:      *
 171:      * <P>
 172:      * <code>REMatch myMatch = myExpression.getMatch(myString);</code>
 173:      * <P>
 174:      * can be viewed (given that myMatch is not null) by creating
 175:      * <P>
 176:      * <code>String theMatch = myString.substring(myMatch.getStartIndex(),
 177:      * myMatch.getEndIndex());</code>
 178:      * <P>
 179:      * But you can save yourself that work, since the <code>toString()</code>
 180:      * method (above) does exactly that for you.  
 181:      */
 182:     public int getEndIndex() {
 183:     return offset + end[0];
 184:     }
 185:   
 186:     /**
 187:      * Returns the string matching the given subexpression.  The subexpressions
 188:      * are indexed starting with one, not zero.  That is, the subexpression
 189:      * identified by the first set of parentheses in a regular expression
 190:      * could be retrieved from an REMatch by calling match.toString(1).
 191:      *
 192:      * @param sub Index of the subexpression.
 193:      */
 194:     public String toString(int sub) {
 195:     if ((sub >= start.length) || sub < 0)
 196:         throw new IndexOutOfBoundsException("No group " + sub);
 197:     if (start[sub] == -1) return null;
 198:     if (start[sub] >= 0 && end[sub] <= matchedText.length())
 199:         return (matchedText.substring(start[sub],end[sub]));
 200:     else {
 201:     // This case occurs with RETokenLookAhead or RETokenLookBehind.
 202:         StringBuffer sb = new StringBuffer();
 203:         int s = start[sub];
 204:         int e = end[sub];
 205:         if (s < 0) s += 1;
 206:         if (e < 0) e += 1;
 207:         for (int i = start[0] + s; i < start[0] + e; i++)
 208:             sb.append(matchedCharIndexed.charAt(i));
 209:         return sb.toString();
 210:     }
 211:     }
 212:     
 213:     /** 
 214:      * Returns the index within the input string used to generate this match
 215:      * where subexpression number <i>sub</i> begins, or <code>-1</code> if
 216:      * the subexpression does not exist.  The initial position is zero.
 217:      *
 218:      * @param sub Subexpression index
 219:      * @deprecated Use getStartIndex(int) instead.
 220:      */
 221:     public int getSubStartIndex(int sub) {
 222:     if (sub >= start.length) return -1;
 223:     int x = start[sub];
 224:     return (x == -1) ? x :
 225:            (x >= 0) ? offset + x : offset + x + 1;
 226:     }
 227:     
 228:     /** 
 229:      * Returns the index within the input string used to generate this match
 230:      * where subexpression number <i>sub</i> begins, or <code>-1</code> if
 231:      * the subexpression does not exist.  The initial position is zero.
 232:      *
 233:      * @param sub Subexpression index
 234:      * @since gnu.regexp 1.1.0
 235:      */
 236:     public int getStartIndex(int sub) {
 237:     if (sub >= start.length) return -1;
 238:     int x = start[sub];
 239:     return (x == -1) ? x :
 240:            (x >= 0) ? offset + x : offset + x + 1;
 241:     }
 242:   
 243:     /** 
 244:      * Returns the index within the input string used to generate this match
 245:      * where subexpression number <i>sub</i> ends, or <code>-1</code> if
 246:      * the subexpression does not exist.  The initial position is zero.
 247:      *
 248:      * @param sub Subexpression index
 249:      * @deprecated Use getEndIndex(int) instead
 250:      */
 251:     public int getSubEndIndex(int sub) {
 252:     if (sub >= start.length) return -1;
 253:     int x = end[sub];
 254:     return (x == -1) ? x :
 255:            (x >= 0) ? offset + x : offset + x + 1;
 256:     }
 257:     
 258:     /** 
 259:      * Returns the index within the input string used to generate this match
 260:      * where subexpression number <i>sub</i> ends, or <code>-1</code> if
 261:      * the subexpression does not exist.  The initial position is zero.
 262:      *
 263:      * @param sub Subexpression index
 264:      */
 265:     public int getEndIndex(int sub) {
 266:     if (sub >= start.length) return -1;
 267:     int x = end[sub];
 268:     return (x == -1) ? x :
 269:            (x >= 0) ? offset + x : offset + x + 1;
 270:     }
 271:     
 272:     /**
 273:      * Substitute the results of this match to create a new string.
 274:      * This is patterned after PERL, so the tokens to watch out for are
 275:      * <code>$0</code> through <code>$9</code>.  <code>$0</code> matches
 276:      * the full substring matched; <code>$<i>n</i></code> matches
 277:      * subexpression number <i>n</i>.
 278:      * <code>$10, $11, ...</code> may match the 10th, 11th, ... subexpressions
 279:      * if such subexpressions exist.
 280:      *
 281:      * @param input A string consisting of literals and <code>$<i>n</i></code> tokens.
 282:      */
 283:     public String substituteInto(String input) {
 284:     // a la Perl, $0 is whole thing, $1 - $9 are subexpressions
 285:     StringBuffer output = new StringBuffer();
 286:     int pos;
 287:     for (pos = 0; pos < input.length()-1; pos++) {
 288:         if ((input.charAt(pos) == '$') && (Character.isDigit(input.charAt(pos+1)))) {
 289:         int val = Character.digit(input.charAt(++pos),10);
 290:         int pos1 = pos + 1;
 291:         while (pos1 < input.length() &&
 292:                Character.isDigit(input.charAt(pos1))) {
 293:             int val1 = val*10 + Character.digit(input.charAt(pos1),10);
 294:             if (val1 >= start.length) break;
 295:             pos1++;
 296:             val = val1;
 297:         }
 298:         pos = pos1 - 1;
 299: 
 300:         if (val < start.length) {
 301:             output.append(toString(val));
 302:         } 
 303:         } else output.append(input.charAt(pos));
 304:     }
 305:     if (pos < input.length()) output.append(input.charAt(pos));
 306:     return output.toString();
 307:     }
 308: 
 309: /*  The following are used for debugging purpose
 310:     public static String d(REMatch m) {
 311:     if (m == null) return "null";
 312:         else return "[" + m.index + "]";
 313:     }
 314: 
 315:     public String substringUptoIndex(CharIndexed input) {
 316:     StringBuffer sb = new StringBuffer();
 317:     for (int i = 0; i < index; i++) {
 318:         sb.append(input.charAt(i));
 319:     }
 320:     return sb.toString();
 321:     }
 322: */
 323: 
 324: }