/*
 * Copyright 2005 John G. Wilson
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package groovy.util;

import groovy.util.slurpersupport.GPathResult;
import groovy.util.slurpersupport.Node;
import groovy.util.slurpersupport.NodeChild;
import groovy.xml.FactorySupport;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Map;
import java.util.Stack;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.DTDHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Builds a tree of {@link Node} objects from a stream of SAX events and
 * exposes the result as a {@link GPathResult}.
 *
 * The class is its own SAX content handler: the <code>parse</code> methods
 * register this instance on the wrapped {@link XMLReader}, run the parse and
 * return the document that was built. The state of the parse in progress is
 * held in instance fields, so one instance should not be used for
 * overlapping parses.
 *
 * @author John Wilson
 *
 */

public class XmlSlurper extends DefaultHandler {
    /** The reader every parse and every delegated XMLReader call goes to. */
    private final XMLReader reader;

    /** The element currently being populated; the root element once parsing has ended. */
    private Node currentNode = null;

    /** Parents of the open elements; the entry pushed for the root element is null. */
    private final Stack stack = new Stack();

    /** Character data collected since the last element boundary. */
    private final StringBuffer charBuffer = new StringBuffer();

    /** Prefix to namespace URI mappings reported by the parser. */
    private final Map namespaceTagHints = new Hashtable();

    /** When false, text consisting only of whitespace is dropped. */
    private boolean keepWhitespace = false;

    /**
     * Creates a non-validating, namespace aware slurper.
     *
     * @throws ParserConfigurationException if the parser factory cannot create a parser
     * @throws SAXException if the parser cannot supply an XMLReader
     */
    public XmlSlurper() throws ParserConfigurationException, SAXException {
        this(false, true);
    }

    /**
     * Creates a slurper backed by a newly created SAX parser.
     *
     * @param validating passed to the parser factory's <code>setValidating</code>
     * @param namespaceAware passed to the parser factory's <code>setNamespaceAware</code>
     * @throws ParserConfigurationException if the parser factory cannot create a parser
     * @throws SAXException if the parser cannot supply an XMLReader
     */
    public XmlSlurper(final boolean validating, final boolean namespaceAware) throws ParserConfigurationException, SAXException {
        SAXParserFactory factory = FactorySupport.createSaxParserFactory();
        factory.setNamespaceAware(namespaceAware);
        factory.setValidating(validating);
        this.reader = factory.newSAXParser().getXMLReader();
    }

    /**
     * Creates a slurper which parses with the supplied reader.
     *
     * @param reader the XMLReader used for all parsing
     */
    public XmlSlurper(final XMLReader reader) {
        this.reader = reader;
    }

    /**
     * Creates a slurper which parses with the XMLReader of the supplied parser.
     *
     * @param parser the SAX parser whose XMLReader is used
     * @throws SAXException if the parser cannot supply an XMLReader
     */
    public XmlSlurper(final SAXParser parser) throws SAXException {
        this(parser.getXMLReader());
    }

    /**
     * @param keepWhitespace
     *
     * If true then character data which consists only of whitespace is kept
     * as a child of the enclosing element.
     * The default is to discard such whitespace.
     */
    public void setKeepWhitespace(boolean keepWhitespace) {
        this.keepWhitespace = keepWhitespace;
    }

    /**
     * @return The GPathResult instance created by consuming a stream of SAX events.
     * The built document is handed over exactly once: the internal reference is
     * cleared by this call. The parse methods call this method themselves, so
     * calling it after one of them, or calling it a second time, yields a
     * result which wraps a null node rather than a document.
     *
     */
    public GPathResult getDocument() {
        try {
            return new NodeChild(this.currentNode, null, this.namespaceTagHints);
        } finally {
            // Cleared even if the constructor throws, so the tree is never handed out twice
            this.currentNode = null;
        }
    }

    /**
     * Parse the content of the specified input source into a GPathResult object
     *
     * @param input
     * @return An object which supports GPath expressions
     * @throws IOException
     * @throws SAXException
     */
    public GPathResult parse(final InputSource input) throws IOException, SAXException {
        this.reader.setContentHandler(this);
        this.reader.parse(input);

        return getDocument();
    }

    /**
     * Parses the content of the given file as XML turning it into a GPathResult object.
     * The file's URI is supplied to the parser as the system id and the file
     * is closed before this method returns.
     *
     * @param file
     * @return An object which supports GPath expressions
     * @throws IOException
     * @throws SAXException
     */
    public GPathResult parse(final File file) throws IOException, SAXException {
        final InputStream stream = new FileInputStream(file);

        try {
            final InputSource input = new InputSource(stream);

            // File.toURI() produces a well formed, escaped file: URI on every
            // platform; prefixing the absolute path with "file://" does not
            // (Windows drive letters, spaces in the path).
            input.setSystemId(file.toURI().toString());

            return parse(input);
        } finally {
            // Close the stream ourselves so that the file handle is released
            // even when the parse fails part way through.
            try {
                stream.close();
            } catch (final IOException ignored) {
                // The stream was only read from; a failure to close it must not
                // hide the result or the exception of the parse itself.
            }
        }
    }

    /**
     * Parse the content of the specified input stream into a GPathResult Object.
     * Note that using this method will not provide the parser with any URI
     * for which to find DTDs etc
     *
     * @param input
     * @return An object which supports GPath expressions
     * @throws IOException
     * @throws SAXException
     */
    public GPathResult parse(final InputStream input) throws IOException, SAXException {
        return parse(new InputSource(input));
    }

    /**
     * Parse the content of the specified reader into a GPathResult Object.
     * Note that using this method will not provide the parser with any URI
     * for which to find DTDs etc
     *
     * @param in
     * @return An object which supports GPath expressions
     * @throws IOException
     * @throws SAXException
     */
    public GPathResult parse(final Reader in) throws IOException, SAXException {
        return parse(new InputSource(in));
    }

    /**
     * Parse the content of the specified URI into a GPathResult Object
     *
     * @param uri
     * @return An object which supports GPath expressions
     * @throws IOException
     * @throws SAXException
     */
    public GPathResult parse(final String uri) throws IOException, SAXException {
        return parse(new InputSource(uri));
    }

    /**
     * A helper method to parse the given text as XML
     *
     * @param text
     * @return An object which supports GPath expressions
     * @throws IOException
     * @throws SAXException
     */
    public GPathResult parseText(final String text) throws IOException, SAXException {
        return parse(new StringReader(text));
    }

    // Delegated XMLReader methods
    //------------------------------------------------------------------------

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getDTDHandler()
     */
    public DTDHandler getDTDHandler() {
        return this.reader.getDTDHandler();
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getEntityResolver()
     */
    public EntityResolver getEntityResolver() {
        return this.reader.getEntityResolver();
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getErrorHandler()
     */
    public ErrorHandler getErrorHandler() {
        return this.reader.getErrorHandler();
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
     */
    public boolean getFeature(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
        return this.reader.getFeature(uri);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
     */
    public Object getProperty(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
        return this.reader.getProperty(uri);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
     */
    public void setDTDHandler(final DTDHandler dtdHandler) {
        this.reader.setDTDHandler(dtdHandler);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
     */
    public void setEntityResolver(final EntityResolver entityResolver) {
        this.reader.setEntityResolver(entityResolver);
    }

    /**
     * Resolves entities using the supplied URL as the base for relative URLs.
     * This replaces any entity resolver previously set on the underlying reader.
     *
     * @param base
     * The URL used to resolve relative URLs
     */
    public void setEntityBaseUrl(final URL base) {
        this.reader.setEntityResolver(new EntityResolver() {
            public InputSource resolveEntity(final String publicId, final String systemId) throws IOException {
                // The public id is ignored; the system id alone decides what is opened
                return new InputSource(new URL(base, systemId).openStream());
            }
        });
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
     */
    public void setErrorHandler(final ErrorHandler errorHandler) {
        this.reader.setErrorHandler(errorHandler);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
     */
    public void setFeature(final String uri, final boolean value) throws SAXNotRecognizedException, SAXNotSupportedException {
        this.reader.setFeature(uri, value);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object)
     */
    public void setProperty(final String uri, final Object value) throws SAXNotRecognizedException, SAXNotSupportedException {
        this.reader.setProperty(uri, value);
    }


    // ContentHandler interface
    //-------------------------------------------------------------------------

    /* (non-Javadoc)
     * @see org.xml.sax.ContentHandler#startDocument()
     */
    public void startDocument() throws SAXException {
        this.currentNode = null;
        this.charBuffer.setLength(0);
        // A previous parse which was aborted part way through a document leaves
        // its open elements on the stack; drop them so they are not retained.
        this.stack.clear();
    }

    /* (non-Javadoc)
     * @see org.xml.sax.helpers.DefaultHandler#startPrefixMapping(java.lang.String, java.lang.String)
     */
    public void startPrefixMapping(final String tag, final String uri) throws SAXException {
        this.namespaceTagHints.put(tag, uri);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
     */
    public void startElement(final String namespaceURI, final String localName, final String qName, final Attributes atts) throws SAXException {
        // Text seen before this tag belongs to the enclosing element
        addCdata();

        final Map attributes = new HashMap();
        final Map attributeNamespaces = new HashMap();

        // Walk the attributes from last to first: when two attributes map to
        // the same key the one which appears first in the list wins.
        for (int i = atts.getLength() - 1; i >= 0; i--) {
            final String attributeUri = atts.getURI(i);

            if (attributeUri.length() == 0) {
                // No namespace: key the attribute by its qualified name
                attributes.put(atts.getQName(i), atts.getValue(i));
            } else {
                // Namespaced: key by local name and record the namespace separately
                final String attributeName = atts.getLocalName(i);

                attributes.put(attributeName, atts.getValue(i));
                attributeNamespaces.put(attributeName, attributeUri);
            }
        }

        // Elements without a namespace are named by their qualified name,
        // namespaced elements by their local name.
        final String elementName = (namespaceURI.length() == 0) ? qName : localName;
        final Node newElement = new Node(this.currentNode, elementName, attributes, attributeNamespaces, namespaceURI);

        if (this.currentNode != null) {
            this.currentNode.addChild(newElement);
        }

        // For the root element this pushes null, which endElement relies on
        this.stack.push(this.currentNode);
        this.currentNode = newElement;
    }

    /* (non-Javadoc)
     * @see org.xml.sax.ContentHandler#characters(char[], int, int)
     */
    public void characters(final char[] ch, final int start, final int length) throws SAXException {
        // Only buffered here; attached to the tree at the next element boundary
        this.charBuffer.append(ch, start, length);
    }

    /* (non-Javadoc)
     * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
     */
    public void endElement(final String namespaceURI, final String localName, final String qName) throws SAXException {
        // Text seen before this end tag belongs to the element being closed
        addCdata();

        final Object oldCurrentNode = this.stack.pop();

        // The parent of the root element is null: in that case currentNode is
        // left pointing at the root so that getDocument() can return it.
        if (oldCurrentNode != null) {
            this.currentNode = (Node)oldCurrentNode;
        }
    }

    /* (non-Javadoc)
     * @see org.xml.sax.ContentHandler#endDocument()
     */
    public void endDocument() throws SAXException {
    }

    // Implementation methods
    //-------------------------------------------------------------------------

    /**
     * Adds the buffered character data, if there is any, to the current
     * element as a child and empties the buffer.
     */
    private void addCdata() {
        if (this.charBuffer.length() != 0) {
            //
            // This element boundary is preceded by character data. The data is added to the
            // body of the current element unless keepWhitespace is false (the default setting)
            // and the data consists only of whitespace.
            // Note that, according to the XML spec, we should preserve the character data if it's
            // all whitespace but for the sort of work I'm doing ignoring the whitespace is preferable
            //
            final String cdata = this.charBuffer.toString();

            this.charBuffer.setLength(0);
            if (this.keepWhitespace || cdata.trim().length() != 0) {
                this.currentNode.addChild(cdata);
            }
        }
    }
}