GNU Classpath (0.17) | ||
Frames | No Frames |
1: /* DocumentParser.java -- A parser for HTML documents. 2: Copyright (C) 2005 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. */ 37: 38: 39: package javax.swing.text.html.parser; 40: 41: import gnu.javax.swing.text.html.parser.htmlAttributeSet; 42: import javax.swing.text.html.parser.Parser; 43: 44: import java.io.IOException; 45: import java.io.Reader; 46: 47: import javax.swing.text.BadLocationException; 48: import javax.swing.text.html.HTMLEditorKit; 49: 50: /** 51: * <p>A simple error-tolerant HTML parser that uses a DTD document 52: * to access data on the possible tokens, arguments and syntax.</p> 53: * <p> The parser reads an HTML content from a Reader and calls various 54: * notifying methods (which should be overridden in a subclass) 55: * when tags or data are encountered.</p> 56: * <p>Some HTML elements need no opening or closing tags. The 57: * task of this parser is to invoke the tag handling methods also when 58: * the tags are not explicitly specified and must be supposed using 59: * information, stored in the DTD. 60: * For example, parsing the document 61: * <p><table><tr><td>a<td>b<td>c</tr> <br> 62: * will invoke exactly the handling methods exactly in the same order 63: * (and with the same parameters) as if parsing the document: <br> 64: * <em><html><head></head><body><table>< 65: * tbody></em><tr><td>a<em></td></em><td>b<em> 66: * </td></em><td>c<em></td></tr></em>< 67: * <em>/tbody></table></body></html></em></p> 68: * (supposed tags are given in italics). The parser also supports 69: * obsolete elements of HTML syntax.<p> 70: * </p> 71: * In this implementation, DocumentParser is directly derived from its 72: * ancestor without changes of functionality. 73: * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org) 74: */ 75: public class DocumentParser 76: extends Parser 77: implements DTDConstants 78: { 79: /** 80: * The enclosed working parser class. 81: */ 82: private class gnuParser 83: extends gnu.javax.swing.text.html.parser.support.Parser 84: { 85: private gnuParser(DTD d) 86: { 87: super(d); 88: } 89: 90: protected final void handleComment(char[] comment) 91: { 92: parser.handleComment(comment); 93: callBack.handleComment(comment, hTag.where.startPosition); 94: } 95: 96: protected final void handleEmptyTag(TagElement tag) 97: throws javax.swing.text.ChangedCharSetException 98: { 99: parser.handleEmptyTag(tag); 100: callBack.handleSimpleTag(tag.getHTMLTag(), getAttributes(), 101: hTag.where.startPosition 102: ); 103: } 104: 105: protected final void handleEndTag(TagElement tag) 106: { 107: parser.handleEndTag(tag); 108: callBack.handleEndTag(tag.getHTMLTag(), hTag.where.startPosition); 109: } 110: 111: protected final void handleError(int line, String message) 112: { 113: parser.handleError(line, message); 114: callBack.handleError(message, hTag.where.startPosition); 115: } 116: 117: protected final void handleStartTag(TagElement tag) 118: { 119: parser.handleStartTag(tag); 120: htmlAttributeSet attributes = gnu.getAttributes(); 121: 122: if (tag.fictional()) 123: attributes.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED, 124: Boolean.TRUE 125: ); 126: 127: callBack.handleStartTag(tag.getHTMLTag(), attributes, 128: hTag.where.startPosition 129: ); 130: } 131: 132: protected final void handleText(char[] text) 133: { 134: parser.handleText(text); 135: callBack.handleText(text, hTag.where.startPosition); 136: } 137: 138: DTD getDTD() 139: { 140: return dtd; 141: } 142: } 143: 144: /** 145: * This field is used to access the identically named 146: * methods of the outer class. 147: * This is package-private to avoid an accessor method. 148: */ 149: DocumentParser parser = this; 150: 151: /** 152: * The callback. 153: * This is package-private to avoid an accessor method. 154: */ 155: HTMLEditorKit.ParserCallback callBack; 156: 157: /** 158: * The reference to the working class of HTML parser that is 159: * actually used to parse the document. 160: * This is package-private to avoid an accessor method. 161: */ 162: gnuParser gnu; 163: 164: /** 165: * Creates a new parser that uses the given DTD to access data on the 166: * possible tokens, arguments and syntax. There is no single - step way 167: * to get a default DTD; you must either refer to the implementation - 168: * specific packages, write your own DTD or obtain the working instance 169: * of parser in other way, for example, by calling 170: * {@link javax.swing.text.html.HTMLEditorKit#getParser() }. 171: * @param a_dtd a DTD to use. 172: */ 173: public DocumentParser(DTD a_dtd) 174: { 175: super(a_dtd); 176: gnu = new gnuParser(a_dtd); 177: } 178: 179: /** 180: * Parses the HTML document, calling methods of the provided 181: * callback. This method must be multithread - safe. 182: * @param reader The reader to read the HTML document from 183: * @param callback The callback that is notifyed about the presence 184: * of HTML elements in the document. 185: * @param ignoreCharSet If thrue, any charset changes during parsing 186: * are ignored. 187: * @throws java.io.IOException 188: */ 189: public void parse(Reader reader, HTMLEditorKit.ParserCallback a_callback, 190: boolean ignoreCharSet 191: ) 192: throws IOException 193: { 194: callBack = a_callback; 195: gnu.parse(reader); 196: 197: callBack.handleEndOfLineString(gnu.getEndOfLineSequence()); 198: try 199: { 200: callBack.flush(); 201: } 202: catch (BadLocationException ex) 203: { 204: // Convert this into the supported type of exception. 205: throw new IOException(ex.getMessage()); 206: } 207: } 208: 209: /** 210: * Handle HTML comment. The default method returns without action. 211: * @param comment the comment being handled 212: */ 213: protected void handleComment(char[] comment) 214: { 215: } 216: 217: /** 218: * Handle the tag with no content, like <br>. The method is 219: * called for the elements that, in accordance with the current DTD, 220: * has an empty content. 221: * @param tag the tag being handled. 222: * @throws javax.swing.text.ChangedCharSetException 223: */ 224: protected void handleEmptyTag(TagElement tag) 225: throws javax.swing.text.ChangedCharSetException 226: { 227: } 228: 229: /** 230: * The method is called when the HTML closing tag ((like </table>) 231: * is found or if the parser concludes that the one should be present 232: * in the current position. 233: * @param The tag being handled 234: */ 235: protected void handleEndTag(TagElement tag) 236: { 237: } 238: 239: /* Handle error that has occured in the given line. */ 240: protected void handleError(int line, String message) 241: { 242: } 243: 244: /** 245: * The method is called when the HTML opening tag ((like <table>) 246: * is found or if the parser concludes that the one should be present 247: * in the current position. 248: * @param The tag being handled 249: */ 250: protected void handleStartTag(TagElement tag) 251: { 252: } 253: 254: /** 255: * Handle the text section. 256: * @param text a section text. 257: */ 258: protected void handleText(char[] text) 259: { 260: } 261: }
GNU Classpath (0.17) |