001 package com.mockrunner.util.web; 002 003 import java.io.StringReader; 004 import java.io.StringWriter; 005 import java.util.List; 006 007 import org.apache.commons.logging.Log; 008 import org.apache.commons.logging.LogFactory; 009 import org.apache.xerces.parsers.DOMParser; 010 import org.cyberneko.html.HTMLConfiguration; 011 import org.jdom.Element; 012 import org.jdom.input.DOMBuilder; 013 import org.jdom.output.XMLOutputter; 014 import org.xml.sax.InputSource; 015 016 import com.mockrunner.base.NestedApplicationException; 017 018 /** 019 * Util class for HTML and XML parsing. 020 */ 021 public class XmlUtil 022 { 023 private final static Log log = LogFactory.getLog(XmlUtil.class); 024 025 /** 026 * Convinience method for HTML fragments. Returns the body 027 * as JDOM <code>Element</code>. 028 * 029 * If an HTML documents looks like this: 030 * <pre> 031 * <html> 032 * <head> 033 * </head> 034 * <body> 035 * <h1> 036 * </h1> 037 * </body> 038 * </html> 039 * </pre> 040 * 041 * the method returns the h1 tag as <code>Element</code>. 042 * @param document the <code>org.jdom.Document</code> 043 * @return the body <code>Element</code> 044 */ 045 public static Element getBodyFragmentFromJDOMDocument(org.jdom.Document document) 046 { 047 Element element = document.getRootElement().getChild("BODY"); 048 if(null == element) 049 { 050 element = document.getRootElement().getChild("body"); 051 } 052 if(null != element) 053 { 054 List childs = element.getChildren(); 055 if(null != childs && childs.size() > 0) return (Element)childs.get(0); 056 } 057 return null; 058 } 059 060 /** 061 * @deprecated use {@link #getBodyFragmentFromJDOMDocument} 062 */ 063 public static Element getBodyFragmentJDOMDocument(org.jdom.Document document) 064 { 065 return getBodyFragmentFromJDOMDocument(document); 066 } 067 068 /** 069 * Returns the documents XML content as a string. 070 * @param document the <code>org.jdom.Document</code> 071 * @return the output as string 072 */ 073 public static String createStringFromJDOMDocument(org.jdom.Document document) 074 { 075 try 076 { 077 XMLOutputter outputter = new XMLOutputter(); 078 StringWriter writer = new StringWriter(); 079 outputter.output(document, writer); 080 writer.flush(); 081 return writer.toString(); 082 } 083 catch(Exception exc) 084 { 085 log.error(exc.getMessage(), exc); 086 throw new NestedApplicationException(exc); 087 } 088 } 089 090 /** 091 * Creates a JDOM <code>Document</code> from a specified 092 * W3C <code>Document</code>. 093 * @param document the <code>org.w3c.dom.Document</code> 094 * @return the <code>org.jdom.Document</code> 095 */ 096 public static org.jdom.Document createJDOMDocument(org.w3c.dom.Document document) 097 { 098 return new DOMBuilder().build(document); 099 } 100 101 /** 102 * Returns a parser suitable for parsing HTML documents. 103 * The NekoHTML parser is used with some settings to 104 * preserve case of tag names and disable namespace processing. 105 * This method is used by {@link #parseHTML}. 106 * @return instance of <code>org.apache.xerces.parsers.DOMParser</code> 107 * with Neko configuration 108 */ 109 public static DOMParser getHTMLParser() 110 { 111 try 112 { 113 HTMLConfiguration config = new HTMLConfiguration(); 114 config.setProperty("http://cyberneko.org/html/properties/names/elems", "match"); 115 config.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change"); 116 DOMParser parser = new DOMParser(config); 117 return parser; 118 } 119 catch(Exception exc) 120 { 121 log.error(exc.getMessage(), exc); 122 throw new NestedApplicationException(exc); 123 } 124 } 125 126 /** 127 * Parses the specified HTML with the NekoHTML parser. 128 * If you want to use another HTML parser or configure 129 * the NekoHTML parser with special features, you can use 130 * the <code>parse</code> method. 131 * @param source the HTML as String 132 * @return the parsed document as org.w3c.dom.Document 133 */ 134 public static org.w3c.dom.Document parseHTML(String source) 135 { 136 try 137 { 138 return parse(getHTMLParser(), source); 139 } 140 catch(Exception exc) 141 { 142 log.error(exc.getMessage(), exc); 143 throw new NestedApplicationException(exc); 144 } 145 } 146 147 /** 148 * Parses the specified XML with the specified parser. 149 * The main purpose of this method is to use the NekoHTML 150 * parser with custom features and properties. If you can live 151 * with the settings provided by Mockrunner, you can use 152 * {@link #parseHTML}. 153 * @param parser the parser (must extend 154 * <code>org.apache.xerces.parsers.DOMParser</code>), 155 * e.g. the one returned by {@link #getHTMLParser} 156 * @param source the XML as String 157 * @return the parsed document as org.w3c.dom.Document 158 */ 159 public static org.w3c.dom.Document parse(DOMParser parser, String source) 160 { 161 try 162 { 163 parser.parse(new InputSource(new StringReader(source))); 164 return parser.getDocument(); 165 } 166 catch(Exception exc) 167 { 168 log.error(exc.getMessage(), exc); 169 throw new NestedApplicationException(exc); 170 } 171 } 172 }