Source for gnu.javax.swing.text.html.parser.htmlValidator

/* htmlValidator.java -- The HTML content validator.
   Copyright (C) 2005 Free Software Foundation, Inc.

This file is part of GNU Classpath.

GNU Classpath is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

GNU Classpath is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with GNU Classpath; see the file COPYING.  If not, write to the
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA.

Linking this library statically or dynamically with other modules is
making a combined work based on this library.  Thus, the terms and
conditions of the GNU General Public License cover the whole
combination.

As a special exception, the copyright holders of this library give you
permission to link this library with independent modules to produce an
executable, regardless of the license terms of these independent
modules, and to copy and distribute the resulting executable under
terms of your choice, provided that you also meet, for each linked
independent module, the terms and conditions of the license of that
module.  An independent module is a module which is not derived from
or based on this library.  If you modify this library, you may extend
this exception to your version of the library, but you are not
obligated to do so.  If you do not wish to do so, delete this
exception statement from your version. */
  37: 
  38: 
  39: package gnu.javax.swing.text.html.parser;
  40: 
import gnu.javax.swing.text.html.parser.models.node;
import gnu.javax.swing.text.html.parser.models.transformer;

import java.util.BitSet;
import java.util.Enumeration;
import java.util.LinkedList;
import java.util.ListIterator;
import java.util.Locale;

import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.parser.*;
  52: 
  53: /**
  54:  * <p>The HTML content validator, is responsible for opening and
  55:  * closing elements with optional start/end tags, detecting
  56:  * the wrongly placed html tags and reporting errors. The working instance
  57:  * is the inner class inside the {@link javax.swing.text.html.parser.Parser }
  58:  * </p>
  59:  * <p>This class could potentially
  60:  * provide basis for automated closing and insertion of the html tags,
  61:  * correcting the found html errors.
  62:  * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
  63:  */
  64: public abstract class htmlValidator
  65: {
  66:   /**
  67:    * The tag reference, holding additional information that the tag
  68:    * has been forcibly closed.
  69:    */
  70:   protected class hTag
  71:   {
  72:     protected final Element element;
  73:     protected final HTML.Tag tag;
  74:     protected final TagElement tgElement;
  75:     protected boolean forcibly_closed;
  76:     protected node validationTrace;
  77: 
  78:     protected hTag(TagElement an_element)
  79:     {
  80:       element = an_element.getElement();
  81:       tag = an_element.getHTMLTag();
  82:       tgElement = an_element;
  83: 
  84:       if (element.content != null)
  85:         validationTrace = transformer.transform(element.content, dtd);
  86:     }
  87: 
  88:     /**
  89:      * This is called when the tag must be forcibly closed because
  90:      * it would make the newly appearing tag invalid.
  91:      * The parser is not notified about such event (just the error
  92:      * is reported). For such tags, the closing message does not
  93:      * appear when later reaching the end of stream. The exception is
  94:      * the &lt;head&gt; tag: the parser is notified about its silent closing
  95:      * when &lt;body&gt; or other html content appears.
  96:      */
  97:     protected void forciblyCloseDueContext()
  98:     {
  99:       forcibly_closed = true;
 100:     }
 101: 
 102:     /**
 103:      * This is called when the tag must be forcibly closed after
 104:      * reaching the end of stream. The parser is notified as if
 105:      * closing the tag explicitly.
 106:      */
 107:     protected void forciblyCloseDueEndOfStream()
 108:     {
 109:       forcibly_closed = true;
 110:       handleSupposedEndTag(element);
 111:     }
 112:   }
 113: 
 114:   /**
 115:    * The DTD, providing information about the valid document structure.
 116:    */
 117:   protected final DTD dtd;
 118: 
 119:   /**
 120:   * The stack, holding the current tag context.
 121:   */
 122:   protected final LinkedList stack = new LinkedList();
 123: 
 124:   /**
 125:    * Creates a new tag stack, using the given DTD.
 126:    * @param a_dtd A DTD, providing the information about the valid
 127:    * tag content.
 128:    */
 129:   public htmlValidator(DTD a_dtd)
 130:   {
 131:     dtd = a_dtd;
 132:   }
 133: 
 134:   /**
 135:    * Close all opened tags (called at the end of parsing).
 136:    */
 137:   public void closeAll()
 138:   {
 139:     hTag h;
 140:     while (!stack.isEmpty())
 141:       {
 142:         h = (hTag) stack.getLast();
 143:         if (!h.forcibly_closed && !h.element.omitEnd())
 144:           s_error("Unclosed <" + h.tag + ">, closing at the end of stream");
 145: 
 146:         handleSupposedEndTag(h.element);
 147: 
 148:         closeTag(h.tgElement);
 149:       }
 150:   }
 151: 
 152:   /**
 153:    * Remove the given tag from the stack or (if found) from the list
 154:    * of the forcibly closed tags.
 155:    */
 156:   public boolean closeTag(TagElement tElement)
 157:   {
 158:     HTML.Tag tag = tElement.getHTMLTag();
 159:     hTag x;
 160:     hTag close;
 161: 
 162:     if (!stack.isEmpty())
 163:       {
 164:         ListIterator iter = stack.listIterator(stack.size());
 165: 
 166:         while (iter.hasPrevious())
 167:           {
 168:             x = (hTag) iter.previous();
 169:             if (tag.equals(x.tag))
 170:               {
 171:                 if (x.forcibly_closed && !x.element.omitEnd())
 172:                   s_error("The tag <" + x.tag +
 173:                           "> has already been forcibly closed"
 174:                          );
 175: 
 176: 
 177:                 // If the tag has a content model defined, forcibly close all
 178:                 // tags that were opened after the tag being currently closed.
 179:                 closing: 
 180:                 if (x.element.content != null)
 181:                   {
 182:                     iter = stack.listIterator(stack.size());
 183:                     while (iter.hasPrevious())
 184:                       {
 185:                         close = (hTag) iter.previous();
 186:                         if (close == x)
 187:                           break closing;
 188:                         handleSupposedEndTag(close.element);
 189:                         iter.remove();
 190:                       }
 191:                   }
 192: 
 193:                 stack.remove(x);
 194:                 return true;
 195:               }
 196:           }
 197:       }
 198:     s_error("Closing unopened <" + tag + ">");
 199:     return false;
 200:   }
 201: 
 202:   /**
 203:    * Add the given HTML tag to the stack of the opened tags. Forcibly closes
 204:    * all tags in the stack that does not allow this tag in they content (error
 205:    * is reported).
 206:    * @param element
 207:    */
 208:   public void openTag(TagElement tElement, htmlAttributeSet parameters)
 209:   {
 210:     // If this is a fictional call, the message from the parser
 211:     // has recursively returned - ignore.
 212:     if (tElement.fictional())
 213:       return;
 214: 
 215:     validateParameters(tElement, parameters);
 216: 
 217:     // If the stack is empty, start from HTML
 218:     if (stack.isEmpty() && tElement.getHTMLTag() != HTML.Tag.HTML)
 219:       {
 220:         Element html = dtd.getElement(HTML.Tag.HTML.toString());
 221:         openFictionalTag(html);
 222:       }
 223: 
 224:     Object v = tagIsValidForContext(tElement);
 225:     if (v != Boolean.TRUE)
 226:       {
 227:         // The tag is not valid for context, the content
 228:         // model suggest to open another tag.
 229:         if (v instanceof Element)
 230:           {
 231:             int n = 0;
 232:             while (v instanceof Element && (n++ < 100))
 233:               {
 234:                 Element fe = (Element) v;
 235: 
 236:                 // notify the content model that we add the proposed tag
 237:                 node ccm = getCurrentContentModel();
 238:                 if (ccm != null)
 239:                   ccm.show(fe);
 240:                 openFictionalTag(fe);
 241: 
 242:                 Object vv = tagIsValidForContext(tElement);
 243:                 if (vv instanceof Element) // One level of nesting is supported.
 244:                   {
 245:                     openFictionalTag((Element) vv);
 246: 
 247:                     Object vx = tagIsValidForContext(tElement);
 248:                     if (vx instanceof Element)
 249:                       openFictionalTag((Element) vx);
 250:                   }
 251:                 else if (vv == Boolean.FALSE)
 252:                   {
 253:                     // The tag is still not valid for the current
 254:                     // content after opening a fictional element.
 255:                     if (fe.omitEnd())
 256:                       {
 257:                         // close the previously opened fictional tag.
 258:                         closeLast();
 259:                         vv = tagIsValidForContext(tElement);
 260:                         if (vv instanceof Element)
 261: 
 262:                           // another tag was suggested by the content model
 263:                           openFictionalTag((Element) vv);
 264:                       }
 265:                   }
 266:                 v = tagIsValidForContext(tElement);
 267:               }
 268:           }
 269:         else // If the current element has the optional end tag, close it.
 270:           {
 271:             if (!stack.isEmpty())
 272:               {
 273:                 closing: 
 274:                 do
 275:                   {
 276:                     hTag last = (hTag) stack.getLast();
 277:                     if (last.element.omitEnd())
 278:                       {
 279:                         closeLast();
 280:                         v = tagIsValidForContext(tElement);
 281:                         if (v instanceof Element) // another tag was suggested by the content model
 282:                           {
 283:                             openFictionalTag((Element) v);
 284:                             break closing;
 285:                           }
 286:                       }
 287:                     else
 288:                       break closing;
 289:                   }
 290:                 while (v == Boolean.FALSE && !stack.isEmpty());
 291:               }
 292:           }
 293:       }
 294: 
 295:     stack.add(new hTag(tElement));
 296:   }
 297: 
 298:   /**
 299:    * Clear the stack.
 300:    */
 301:   public void restart()
 302:   {
 303:     stack.clear();
 304:   }
 305: 
 306:   /**
 307:    * Check if this tag is valid for the current context. Return Boolean.True if
 308:    * it is OK, Boolean.False if it is surely not OK or the Element that the
 309:    * content model recommends to insert making the situation ok. If Boolean.True
 310:    * is returned, the content model current position is moved forward. Otherwise
 311:    * this position remains the same.
 312:    * 
 313:    * @param tElement
 314:    * @return
 315:    */
 316:   public Object tagIsValidForContext(TagElement tElement)
 317:   {
 318:     // Check the current content model, if one is available.
 319:     node cv = getCurrentContentModel();
 320: 
 321:     if (cv != null)
 322:       return cv.show(tElement.getElement());
 323: 
 324:     // Check exclusions and inclusions.
 325:     ListIterator iter = stack.listIterator(stack.size());
 326:     hTag t = null;
 327:     final int idx = tElement.getElement().index;
 328: 
 329:     // Check only known tags.
 330:     if (idx >= 0)
 331:       {
 332:         BitSet inclusions = new BitSet();
 333:         while (iter.hasPrevious())
 334:           {
 335:             t = (hTag) iter.previous();
 336:             if (! t.forcibly_closed)
 337:               {
 338:                 if (t.element.exclusions != null
 339:                     && t.element.exclusions.get(idx))
 340:                   return Boolean.FALSE;
 341: 
 342:                 if (t.element.inclusions != null)
 343:                   inclusions.or(t.element.inclusions);
 344:               }
 345:           }
 346:         if (! inclusions.get(idx))
 347:           {
 348:             // If we need to insert something, and cannot do this, but
 349:             // it is allowed to insert the paragraph here, insert the
 350:             // paragraph.
 351:             Element P = dtd.getElement(HTML_401F.P);
 352:             if (inclusions.get(P.index))
 353:               return P;
 354:             else
 355:               return Boolean.FALSE;
 356:           }
 357:       }
 358:     return Boolean.TRUE;
 359:   }
 360: 
 361:   /**
 362:    * Validate tag without storing in into the tag stack. This is called
 363:    * for the empty tags and results the subsequent calls to the openTag
 364:    * and closeTag.
 365:    */
 366:   public void validateTag(TagElement tElement, htmlAttributeSet parameters)
 367:   {
 368:     openTag(tElement, parameters);
 369:     closeTag(tElement);
 370:   }
 371: 
 372:   /**
 373:    * Check for mandatory elements, subsequent to the last tag:
 374:    * @param tElement The element that will be inserted next.
 375:    */
 376:   protected void checkContentModel(TagElement tElement, boolean first)
 377:   {
 378:     if (stack.isEmpty())
 379:       return;
 380: 
 381:     hTag last = (hTag) stack.getLast();
 382:     if (last.validationTrace == null)
 383:       return;
 384: 
 385:     Object r = last.validationTrace.show(tElement.getElement());
 386:     if (r == Boolean.FALSE)
 387:       s_error("The <" + last.element + "> does not match the content model " +
 388:               last.validationTrace
 389:              );
 390:     else if (r instanceof Element) // The content model recommends insertion of this element
 391:       {
 392:         if (!first)
 393:           closeTag(last.tgElement);
 394:         handleSupposedStartTag((Element) r);
 395:         openTag(new TagElement((Element) r), null);
 396:       }
 397:   }
 398: 
 399:   /**
 400:    * The method is called when the tag must be closed because
 401:    * it does not allow the subsequent elements inside its context
 402:    * or the end of stream has been reached. The parser is only
 403:    * informed if the element being closed does not require the
 404:    * end tag (the "omitEnd" flag is set).
 405:    * The closing message must be passed to the parser mechanism
 406:    * before passing message about the opening the next tag.
 407:    *
 408:    * @param element The tag being fictionally (forcibly) closed.
 409:    */
 410:   protected abstract void handleSupposedEndTag(Element element);
 411: 
 412:   /**
 413:    * The method is called when the validator decides to open the
 414:    * tag on its own initiative. This may happen if the content model
 415:    * includes the element with the optional (supposed) start tag.
 416:    *
 417:    * @param element The tag being opened.
 418:    */
 419:   protected abstract void handleSupposedStartTag(Element element);
 420: 
 421:   /**
 422:    * Handles the error message. This method must be overridden to pass
 423:    * the message where required.
 424:    * @param msg The message text.
 425:    */
 426:   protected abstract void s_error(String msg);
 427: 
 428:   /**
 429:    * Validate the parameters, report the error if the given parameter is
 430:    * not in the parameter set, valid for the given attribute. The information
 431:    * about the valid parameter set is taken from the Element, enclosed
 432:    * inside the tag. The method does not validate the default parameters.
 433:    * @param tag The tag
 434:    * @param parameters The parameters of this tag.
 435:    */
 436:   protected void validateParameters(TagElement tag, htmlAttributeSet parameters)
 437:   {
 438:     if (parameters == null ||
 439:         parameters == htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET ||
 440:         parameters == SimpleAttributeSet.EMPTY
 441:        )
 442:       return;
 443: 
 444:     Enumeration enumeration = parameters.getAttributeNames();
 445: 
 446:     while (enumeration.hasMoreElements())
 447:       {
 448:         validateAttribute(tag, parameters, enumeration);
 449:       }
 450: 
 451:     // Check for missing required values.
 452:     AttributeList a = tag.getElement().getAttributes();
 453: 
 454:     while (a != null)
 455:       {
 456:         if (a.getModifier() == DTDConstants.REQUIRED)
 457:           if (parameters.getAttribute(a.getName()) == null)
 458:             {
 459:               s_error("Missing required attribute '" + a.getName() + "' for <" +
 460:                       tag.getHTMLTag() + ">"
 461:                      );
 462:             }
 463:         a = a.next;
 464:       }
 465:   }
 466: 
 467:   private node getCurrentContentModel()
 468:   {
 469:     if (!stack.isEmpty())
 470:       {
 471:         hTag last = (hTag) stack.getLast();
 472:         return last.validationTrace;
 473:       }
 474:     else
 475:       return null;
 476:   }
 477: 
 478:   private void closeLast()
 479:   {
 480:     handleSupposedEndTag(((hTag) stack.getLast()).element);
 481:     stack.removeLast();
 482:   }
 483: 
 484:   private void openFictionalTag(Element e)
 485:   {
 486:     handleSupposedStartTag(e);
 487:     stack.add(new hTag(new TagElement(e, true)));
 488:     if (!e.omitStart())
 489:       s_error("<" + e + "> is expected (supposing it)");
 490:   }
 491: 
 492:   private void validateAttribute(TagElement tag, htmlAttributeSet parameters,
 493:                                  Enumeration enumeration
 494:                                 )
 495:   {
 496:     Object foundAttribute;
 497:     AttributeList dtdAttribute;
 498:     foundAttribute = enumeration.nextElement();
 499:     dtdAttribute = tag.getElement().getAttribute(foundAttribute.toString());
 500:     if (dtdAttribute == null)
 501:       {
 502:         StringBuffer valid =
 503:           new StringBuffer("The tag <" + tag.getHTMLTag() +
 504:                            "> cannot contain the attribute '" + foundAttribute +
 505:                            "'. The valid attributes for this tag are: "
 506:                           );
 507: 
 508:         AttributeList a = tag.getElement().getAttributes();
 509: 
 510:         while (a != null)
 511:           {
 512:             valid.append(a.name.toUpperCase());
 513:             valid.append(' ');
 514:             a = a.next;
 515:           }
 516:         s_error(valid.toString());
 517:       }
 518: 
 519:     else
 520:       {
 521:         String value = parameters.getAttribute(foundAttribute).toString();
 522: 
 523:         if (dtdAttribute.type == DTDConstants.NUMBER)
 524:           validateNumberAttribute(tag, foundAttribute, value);
 525: 
 526:         if (dtdAttribute.type == DTDConstants.NAME ||
 527:             dtdAttribute.type == DTDConstants.ID
 528:            )
 529:           validateNameOrIdAttribute(tag, foundAttribute, value);
 530: 
 531:         if (dtdAttribute.values != null)
 532:           validateAttributeWithValueList(tag, foundAttribute, dtdAttribute,
 533:                                          value
 534:                                         );
 535:       }
 536:   }
 537: 
 538:   private void validateAttributeWithValueList(TagElement tag,
 539:                                               Object foundAttribute,
 540:                                               AttributeList dtdAttribute,
 541:                                               String value
 542:                                              )
 543:   {
 544:     if (!dtdAttribute.values.contains(value.toLowerCase()) &&
 545:         !dtdAttribute.values.contains(value.toUpperCase())
 546:        )
 547:       {
 548:         StringBuffer valid;
 549:         if (dtdAttribute.values.size() == 1)
 550:           valid =
 551:             new StringBuffer("The attribute '" + foundAttribute +
 552:                              "' of the tag <" + tag.getHTMLTag() +
 553:                              "> cannot have the value '" + value +
 554:                              "'. The only valid value is "
 555:                             );
 556:         else
 557:           valid =
 558:             new StringBuffer("The attribute '" + foundAttribute +
 559:                              "' of the tag <" + tag.getHTMLTag() +
 560:                              "> cannot have the value '" + value + "'. The " +
 561:                              dtdAttribute.values.size() +
 562:                              " valid values are: "
 563:                             );
 564: 
 565:         Enumeration vv = dtdAttribute.values.elements();
 566:         while (vv.hasMoreElements())
 567:           {
 568:             valid.append('"');
 569:             valid.append(vv.nextElement());
 570:             valid.append("\"  ");
 571:           }
 572:         s_error(valid.toString());
 573:       }
 574:   }
 575: 
 576:   private void validateNameOrIdAttribute(TagElement tag, Object foundAttribute,
 577:                                          String value
 578:                                         )
 579:   {
 580:     boolean ok = true;
 581: 
 582:     if (!Character.isLetter(value.charAt(0)))
 583:       ok = false;
 584: 
 585:     char c;
 586:     for (int i = 0; i < value.length(); i++)
 587:       {
 588:         c = value.charAt(i);
 589:         if (!(
 590:               Character.isLetter(c) || Character.isDigit(c) ||
 591:               "".indexOf(c) >= 0
 592:             )
 593:            )
 594:           ok = false;
 595:       }
 596:     if (!ok)
 597:       s_error("The '" + foundAttribute + "' attribute of the tag <" +
 598:               tag.getHTMLTag() + "> must start from letter and consist of " +
 599:               "letters, digits, hypens, colons, underscores and periods. " +
 600:               "It cannot be '" + value + "'"
 601:              );
 602:   }
 603: 
 604:   private void validateNumberAttribute(TagElement tag, Object foundAttribute,
 605:                                        String value
 606:                                       )
 607:   {
 608:     try
 609:       {
 610:         Integer.parseInt(value);
 611:       }
 612:     catch (NumberFormatException ex)
 613:       {
 614:         s_error("The '" + foundAttribute + "' attribute of the tag <" +
 615:                 tag.getHTMLTag() + "> must be a valid number and not '" +
 616:                 value + "'"
 617:                );
 618:       }
 619:   }
 620: }