001 /* 002 * Copyright 2005 John G. Wilson 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 * 016 */ 017 018 package groovy.util; 019 020 import groovy.util.slurpersupport.GPathResult; 021 import groovy.util.slurpersupport.Node; 022 import groovy.util.slurpersupport.NodeChild; 023 024 import java.io.File; 025 import java.io.FileInputStream; 026 import java.io.IOException; 027 import java.io.InputStream; 028 import java.io.Reader; 029 import java.io.StringReader; 030 import java.net.URL; 031 import java.security.AccessController; 032 import java.security.PrivilegedActionException; 033 import java.security.PrivilegedExceptionAction; 034 import java.util.HashMap; 035 import java.util.Hashtable; 036 import java.util.Map; 037 import java.util.Stack; 038 039 import javax.xml.parsers.ParserConfigurationException; 040 import javax.xml.parsers.SAXParser; 041 import javax.xml.parsers.SAXParserFactory; 042 043 import org.xml.sax.Attributes; 044 import org.xml.sax.DTDHandler; 045 import org.xml.sax.EntityResolver; 046 import org.xml.sax.ErrorHandler; 047 import org.xml.sax.InputSource; 048 import org.xml.sax.SAXException; 049 import org.xml.sax.SAXNotRecognizedException; 050 import org.xml.sax.SAXNotSupportedException; 051 import org.xml.sax.XMLReader; 052 import org.xml.sax.helpers.DefaultHandler; 053 054 /** 055 * @author John Wilson 056 * 057 */ 058 059 public class XmlSlurper extends DefaultHandler { 060 private final XMLReader reader; 061 private Node currentNode = null; 062 private final Stack stack = new Stack(); 063 private final StringBuffer charBuffer = new StringBuffer(); 064 private final Map namespaceTagHints = new Hashtable(); 065 066 public XmlSlurper() throws ParserConfigurationException, SAXException { 067 this(false, true); 068 } 069 070 public XmlSlurper(final boolean validating, final boolean namespaceAware) throws ParserConfigurationException, SAXException { 071 SAXParserFactory factory = null; 072 073 try { 074 factory = (SAXParserFactory) AccessController.doPrivileged(new PrivilegedExceptionAction() { 075 public Object run() throws ParserConfigurationException { 076 return SAXParserFactory.newInstance(); 077 } 078 }); 079 } catch (final PrivilegedActionException pae) { 080 final Exception e = pae.getException(); 081 082 if (e instanceof ParserConfigurationException) { 083 throw (ParserConfigurationException) e; 084 } else { 085 throw new RuntimeException(e); 086 } 087 } 088 factory.setNamespaceAware(namespaceAware); 089 factory.setValidating(validating); 090 091 final SAXParser parser = factory.newSAXParser(); 092 this.reader = parser.getXMLReader(); 093 } 094 095 public XmlSlurper(final XMLReader reader) { 096 this.reader = reader; 097 } 098 099 public XmlSlurper(final SAXParser parser) throws SAXException { 100 this(parser.getXMLReader()); 101 } 102 103 /** 104 * @return The GPathResult instance created by consuming a stream of SAX events 105 * Note if one of the parse methods has been called then this returns null 106 * Note if this is called more than once all calls after the first will return null 107 * 108 */ 109 public GPathResult getDocument() { 110 try { 111 return new NodeChild(this.currentNode, null, this.namespaceTagHints); 112 } finally { 113 this.currentNode = null; 114 } 115 } 116 117 /** 118 * Parse the content of the specified input source into a GPathResult object 119 * 120 * @param input 121 * @return An object which supports GPath expressions 122 * @throws IOException 123 * @throws SAXException 124 */ 125 public GPathResult parse(final InputSource input) throws IOException, SAXException { 126 this.reader.setContentHandler(this); 127 this.reader.parse(input); 128 129 return getDocument(); 130 131 } 132 133 /** 134 * Parses the content of the given file as XML turning it into a GPathResult object 135 * 136 * @param file 137 * @return An object which supports GPath expressions 138 * @throws IOException 139 * @throws SAXException 140 */ 141 public GPathResult parse(final File file) throws IOException, SAXException { 142 final InputSource input = new InputSource(new FileInputStream(file)); 143 144 input.setSystemId("file://" + file.getAbsolutePath()); 145 146 return parse(input); 147 148 } 149 150 /** 151 * Parse the content of the specified input stream into an GPathResult Object. 152 * Note that using this method will not provide the parser with any URI 153 * for which to find DTDs etc 154 * 155 * @param input 156 * @return An object which supports GPath expressions 157 * @throws IOException 158 * @throws SAXException 159 */ 160 public GPathResult parse(final InputStream input) throws IOException, SAXException { 161 return parse(new InputSource(input)); 162 } 163 164 /** 165 * Parse the content of the specified reader into a GPathResult Object. 166 * Note that using this method will not provide the parser with any URI 167 * for which to find DTDs etc 168 * 169 * @param in 170 * @return An object which supports GPath expressions 171 * @throws IOException 172 * @throws SAXException 173 */ 174 public GPathResult parse(final Reader in) throws IOException, SAXException { 175 return parse(new InputSource(in)); 176 } 177 178 /** 179 * Parse the content of the specified URI into a GPathResult Object 180 * 181 * @param uri 182 * @return An object which supports GPath expressions 183 * @throws IOException 184 * @throws SAXException 185 */ 186 public GPathResult parse(final String uri) throws IOException, SAXException { 187 return parse(new InputSource(uri)); 188 } 189 190 /** 191 * A helper method to parse the given text as XML 192 * 193 * @param text 194 * @return An object which supports GPath expressions 195 */ 196 public GPathResult parseText(final String text) throws IOException, SAXException { 197 return parse(new StringReader(text)); 198 } 199 200 // Delegated XMLReader methods 201 //------------------------------------------------------------------------ 202 203 /* (non-Javadoc) 204 * @see org.xml.sax.XMLReader#getDTDHandler() 205 */ 206 public DTDHandler getDTDHandler() { 207 return this.reader.getDTDHandler(); 208 } 209 210 /* (non-Javadoc) 211 * @see org.xml.sax.XMLReader#getEntityResolver() 212 */ 213 public EntityResolver getEntityResolver() { 214 return this.reader.getEntityResolver(); 215 } 216 217 /* (non-Javadoc) 218 * @see org.xml.sax.XMLReader#getErrorHandler() 219 */ 220 public ErrorHandler getErrorHandler() { 221 return this.reader.getErrorHandler(); 222 } 223 224 /* (non-Javadoc) 225 * @see org.xml.sax.XMLReader#getFeature(java.lang.String) 226 */ 227 public boolean getFeature(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException { 228 return this.reader.getFeature(uri); 229 } 230 231 /* (non-Javadoc) 232 * @see org.xml.sax.XMLReader#getProperty(java.lang.String) 233 */ 234 public Object getProperty(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException { 235 return this.reader.getProperty(uri); 236 } 237 238 /* (non-Javadoc) 239 * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler) 240 */ 241 public void setDTDHandler(final DTDHandler dtdHandler) { 242 this.reader.setDTDHandler(dtdHandler); 243 } 244 245 /* (non-Javadoc) 246 * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver) 247 */ 248 public void setEntityResolver(final EntityResolver entityResolver) { 249 this.reader.setEntityResolver(entityResolver); 250 } 251 252 /** 253 * Resolves entities against using the suppied URL as the base for relative URLs 254 * 255 * @param base 256 * The URL used to resolve relative URLs 257 */ 258 public void setEntityBaseUrl(final URL base) { 259 this.reader.setEntityResolver(new EntityResolver() { 260 public InputSource resolveEntity(final String publicId, final String systemId) throws IOException { 261 return new InputSource(new URL(base, systemId).openStream()); 262 } 263 }); 264 } 265 266 /* (non-Javadoc) 267 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler) 268 */ 269 public void setErrorHandler(final ErrorHandler errorHandler) { 270 this.reader.setErrorHandler(errorHandler); 271 } 272 273 /* (non-Javadoc) 274 * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean) 275 */ 276 public void setFeature(final String uri, final boolean value) throws SAXNotRecognizedException, SAXNotSupportedException { 277 this.reader.setFeature(uri, value); 278 } 279 280 /* (non-Javadoc) 281 * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object) 282 */ 283 public void setProperty(final String uri, final Object value) throws SAXNotRecognizedException, SAXNotSupportedException { 284 this.reader.setProperty(uri, value); 285 } 286 287 288 // ContentHandler interface 289 //------------------------------------------------------------------------- 290 291 /* (non-Javadoc) 292 * @see org.xml.sax.ContentHandler#startDocument() 293 */ 294 public void startDocument() throws SAXException { 295 this.currentNode = null; 296 this.charBuffer.setLength(0); 297 } 298 299 /* (non-Javadoc) 300 * @see org.xml.sax.helpers.DefaultHandler#startPrefixMapping(java.lang.String, java.lang.String) 301 */ 302 public void startPrefixMapping(final String tag, final String uri) throws SAXException { 303 this.namespaceTagHints.put(tag, uri); 304 } 305 306 /* (non-Javadoc) 307 * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes) 308 */ 309 public void startElement(final String namespaceURI, final String localName, final String qName, final Attributes atts) throws SAXException { 310 addNonWhitespaceCdata(); 311 312 final Map attributes = new HashMap(); 313 final Map attributeNamespaces = new HashMap(); 314 315 for (int i = atts.getLength() - 1; i != -1; i--) { 316 if (atts.getURI(i).length() == 0) { 317 attributes.put(atts.getQName(i), atts.getValue(i)); 318 } else { 319 attributes.put(atts.getLocalName(i), atts.getValue(i)); 320 attributeNamespaces.put(atts.getLocalName(i), atts.getURI(i)); 321 } 322 323 } 324 325 final Node newElement; 326 327 if (namespaceURI.length() == 0){ 328 newElement = new Node(this.currentNode, qName, attributes, attributeNamespaces, namespaceURI); 329 } else { 330 newElement = new Node(this.currentNode, localName, attributes, attributeNamespaces, namespaceURI); 331 } 332 333 if (this.currentNode != null) { 334 this.currentNode.addChild(newElement); 335 } 336 337 this.stack.push(this.currentNode); 338 this.currentNode = newElement; 339 } 340 341 /* (non-Javadoc) 342 * @see org.xml.sax.ContentHandler#characters(char[], int, int) 343 */ 344 public void characters(final char[] ch, final int start, final int length) throws SAXException { 345 this.charBuffer.append(ch, start, length); 346 } 347 348 /* (non-Javadoc) 349 * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String) 350 */ 351 public void endElement(final String namespaceURI, final String localName, final String qName) throws SAXException { 352 addNonWhitespaceCdata(); 353 354 final Object oldCurrentNode = this.stack.pop(); 355 356 if (oldCurrentNode != null) { 357 this.currentNode = (Node)oldCurrentNode; 358 } 359 } 360 361 /* (non-Javadoc) 362 * @see org.xml.sax.ContentHandler#endDocument() 363 */ 364 public void endDocument() throws SAXException { 365 } 366 367 // Implementation methods 368 //------------------------------------------------------------------------- 369 370 /** 371 * 372 */ 373 private void addNonWhitespaceCdata() { 374 if (this.charBuffer.length() != 0) { 375 // 376 // This element is preceeded by CDATA if it's not whitespace add it to the body 377 // Note that, according to the XML spec, we should preserve the CDATA if it's all whitespace 378 // but for the sort of work I'm doing ignoring the whitespace is preferable 379 // 380 final String cdata = this.charBuffer.toString(); 381 382 this.charBuffer.setLength(0); 383 if (cdata.trim().length() != 0) { 384 this.currentNode.addChild(cdata); 385 } 386 } 387 } 388 }