Package lxml :: Package html :: Module html5parser
[hide private]
[frames | no frames]

Source Code for Module lxml.html.html5parser

  1  """ 
  2  An interface to html5lib. 
  3  """ 
  4   
  5  import urllib 
  6  from html5lib import HTMLParser as _HTMLParser 
  7  from lxml import etree 
  8  from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE 
  9  from lxml.html._html5builder import TreeBuilder 
 10   
 11  # python3 compatibility 
 12  try: 
 13      _strings = basestring 
 14  except NameError: 
 15      _strings = (bytes, str) 
 16   
 17   
class HTMLParser(_HTMLParser):
    """html5lib's HTML parser, preconfigured to build lxml trees."""

    def __init__(self, strict=False):
        """Set up the html5lib parser with this module's ``TreeBuilder``.

        ``strict`` is forwarded unchanged to the html5lib base class.
        """
        _HTMLParser.__init__(self, tree=TreeBuilder, strict=strict)
# XHTML support is optional: the names below are only defined when the
# installed html5lib exports an ``XHTMLParser``.
try:
    from html5lib import XHTMLParser as _XHTMLParser
except ImportError:
    pass
else:
    class XHTMLParser(_XHTMLParser):
        """html5lib's XHTML parser, preconfigured to build lxml trees."""

        def __init__(self, strict=False):
            """Set up the html5lib parser with this module's ``TreeBuilder``.

            ``strict`` is forwarded unchanged to the html5lib base class.
            """
            _XHTMLParser.__init__(self, tree=TreeBuilder, strict=strict)

    # Module-level XHTML parser instance.
    xhtml_parser = XHTMLParser()
def _find_tag(tree, tag):
    """Return the first ``tag`` match found by ``tree.find()``.

    The plain tag name is tried first; if that yields nothing, the lookup
    is repeated with the tag qualified by ``XHTML_NAMESPACE``.  Returns
    whatever the second ``find()`` returns when the first one misses.
    """
    plain_match = tree.find(tag)
    if plain_match is None:
        return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
    return plain_match
44 45
def document_fromstring(html, guess_charset=True, parser=None):
    """Parse a string holding a whole document and return its root element.

    ``html`` must be one of the ``_strings`` types, otherwise ``TypeError``
    is raised.  ``guess_charset`` is handed to the parser as ``useChardet``.
    When ``parser`` is ``None`` the module-level ``html_parser`` is used.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser
    document = active_parser.parse(html, useChardet=guess_charset)
    return document.getroot()
55 56
def fragments_fromstring(html, no_leading_text=False,
                         guess_charset=False, parser=None):
    """Parse several HTML elements and return them as a list.

    The first list item may be a string (text that precedes the first
    element).  With ``no_leading_text`` set, non-blank leading text raises
    ``etree.ParserError`` and blank leading text is dropped, so the result
    then contains elements only.

    ``guess_charset`` is handed to the parser as ``useChardet``; if it is
    true and the input is a bytestring rather than unicode, the ``chardet``
    library performs charset guessing on it.

    Raises ``TypeError`` if ``html`` is not one of the ``_strings`` types.
    When ``parser`` is ``None`` the module-level ``html_parser`` is used.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser
    nodes = active_parser.parseFragment(html, 'div',
                                        useChardet=guess_charset)

    starts_with_text = bool(nodes) and isinstance(nodes[0], _strings)
    if starts_with_text and no_leading_text:
        leading = nodes[0]
        if leading.strip():
            raise etree.ParserError('There is leading text: %r' % leading)
        # Whitespace-only leading text is silently discarded.
        del nodes[0]
    return nodes
83 84
def fragment_fromstring(html, create_parent=False,
                        guess_charset=False, parser=None):
    """Parse a single HTML element.

    It is an error if there is more than one element, or if anything but
    whitespace precedes or follows the element.

    If ``create_parent`` is true (or is a tag name) then a parent node is
    created to encapsulate the HTML in a single element.  In this case,
    leading or trailing text is allowed.  Any true value that is not a
    string selects ``'div'`` as the parent tag.

    ``guess_charset`` and ``parser`` are passed on to
    ``fragments_fromstring()``.

    Raises ``TypeError`` if ``html`` is not one of the ``_strings`` types,
    and ``etree.ParserError`` if no element, several elements, or trailing
    non-blank text is found (without ``create_parent``).
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    accept_leading_text = bool(create_parent)

    # No extra keyword arguments are forwarded: this function does not
    # accept any, so there is nothing to pass along.
    elements = fragments_fromstring(
        html, guess_charset=guess_charset, parser=parser,
        no_leading_text=not accept_leading_text)

    if create_parent:
        # Imported here because the module header does not import it.
        from lxml.html import Element

        # Use the module's ``_strings`` shim rather than ``basestring`` so
        # the check also works on Python 3.
        if not isinstance(create_parent, _strings):
            create_parent = 'div'
        new_root = Element(create_parent)
        if elements:
            if isinstance(elements[0], _strings):
                # Leading text becomes the text of the new parent.
                new_root.text = elements[0]
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        raise etree.ParserError('Multiple elements found')
    result = elements[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    # Only whitespace may follow the element; drop it from the result.
    result.tail = None
    return result
124 125
def fromstring(html, guess_charset=True, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document:

    * input that starts with ``<html`` or a doctype, or that yields a
      non-empty ``<head>``, is returned as the whole document root;
    * a body holding exactly one element and no surrounding non-blank text
      is returned as that element;
    * otherwise the ``<body>`` element itself is returned, renamed to
      ``div`` or ``span`` depending on ``_contains_block_level_tag()``.

    ``guess_charset`` and ``parser`` are passed on to
    ``document_fromstring()``.  Raises ``TypeError`` if ``html`` is not one
    of the ``_strings`` types.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    start = html[:50]
    if isinstance(start, bytes):
        # bytes.startswith() rejects text prefixes on Python 3.  ASCII is
        # enough for the markers we look for; anything else is replaced.
        start = start.decode('ascii', 'replace')
    start = start.lstrip().lower()
    if start.startswith(('<html', '<!doctype')):
        return doc

    head = _find_tag(doc, 'head')

    # if the head is not empty we have a full document
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
166 167
def _open_location(location):
    """Open a filename or URL given as a string; return a readable stream."""
    legacy_urlopen = getattr(urllib, 'urlopen', None)
    if legacy_urlopen is not None:
        # Python 2: urllib.urlopen() handles URLs and local paths alike.
        return legacy_urlopen(location)

    # Python 3: urlopen() lives in urllib.request and rejects bare paths,
    # so local files are opened directly.
    from urllib.parse import urlparse
    from urllib.request import urlopen
    # A one-letter "scheme" is a Windows drive letter, not a URL scheme.
    if len(urlparse(location).scheme) > 1:
        return urlopen(location)
    return open(location, 'rb')


def parse(filename_url_or_file, guess_charset=True, parser=None):
    """Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    A string argument is opened by this function and closed again once
    parsing has finished; any other object is treated as an already open
    file-like object and is left open for the caller.

    ``guess_charset`` is handed to the parser as ``useChardet``.  When
    ``parser`` is ``None`` the module-level ``html_parser`` is used.
    """
    if parser is None:
        parser = html_parser

    # ``_strings`` instead of ``basestring`` keeps this working on Python 3.
    if not isinstance(filename_url_or_file, _strings):
        return parser.parse(filename_url_or_file, useChardet=guess_charset)

    fp = _open_location(filename_url_or_file)
    try:
        return parser.parse(fp, useChardet=guess_charset)
    finally:
        # We opened the stream, so we are responsible for releasing it.
        fp.close()
# Shared default parser, used by the functions above whenever the caller
# passes ``parser=None``.
html_parser = HTMLParser()