Du kannst nicht mehr als 25 Themen auswählen Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.
 
 
 
 

261 Zeilen
8.4 KiB

  1. """
  2. An interface to html5lib that mimics the lxml.html interface.
  3. """
  4. import sys
  5. import string
  6. from html5lib import HTMLParser as _HTMLParser
  7. from html5lib.treebuilders.etree_lxml import TreeBuilder
  8. from lxml import etree
  9. from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
  10. # python3 compatibility
  11. try:
  12. _strings = basestring
  13. except NameError:
  14. _strings = (bytes, str)
  15. try:
  16. from urllib2 import urlopen
  17. except ImportError:
  18. from urllib.request import urlopen
  19. try:
  20. from urlparse import urlparse
  21. except ImportError:
  22. from urllib.parse import urlparse
  23. class HTMLParser(_HTMLParser):
  24. """An html5lib HTML parser with lxml as tree."""
  25. def __init__(self, strict=False, **kwargs):
  26. _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
  27. try:
  28. from html5lib import XHTMLParser as _XHTMLParser
  29. except ImportError:
  30. pass
  31. else:
  32. class XHTMLParser(_XHTMLParser):
  33. """An html5lib XHTML Parser with lxml as tree."""
  34. def __init__(self, strict=False, **kwargs):
  35. _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
  36. xhtml_parser = XHTMLParser()
  37. def _find_tag(tree, tag):
  38. elem = tree.find(tag)
  39. if elem is not None:
  40. return elem
  41. return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
  42. def document_fromstring(html, guess_charset=None, parser=None):
  43. """
  44. Parse a whole document into a string.
  45. If `guess_charset` is true, or if the input is not Unicode but a
  46. byte string, the `chardet` library will perform charset guessing
  47. on the string.
  48. """
  49. if not isinstance(html, _strings):
  50. raise TypeError('string required')
  51. if parser is None:
  52. parser = html_parser
  53. options = {}
  54. if guess_charset is None and isinstance(html, bytes):
  55. # html5lib does not accept useChardet as an argument, if it
  56. # detected the html argument would produce unicode objects.
  57. guess_charset = True
  58. if guess_charset is not None:
  59. options['useChardet'] = guess_charset
  60. return parser.parse(html, **options).getroot()
  61. def fragments_fromstring(html, no_leading_text=False,
  62. guess_charset=None, parser=None):
  63. """Parses several HTML elements, returning a list of elements.
  64. The first item in the list may be a string. If no_leading_text is true,
  65. then it will be an error if there is leading text, and it will always be
  66. a list of only elements.
  67. If `guess_charset` is true, the `chardet` library will perform charset
  68. guessing on the string.
  69. """
  70. if not isinstance(html, _strings):
  71. raise TypeError('string required')
  72. if parser is None:
  73. parser = html_parser
  74. options = {}
  75. if guess_charset is None and isinstance(html, bytes):
  76. # html5lib does not accept useChardet as an argument, if it
  77. # detected the html argument would produce unicode objects.
  78. guess_charset = False
  79. if guess_charset is not None:
  80. options['useChardet'] = guess_charset
  81. children = parser.parseFragment(html, 'div', **options)
  82. if children and isinstance(children[0], _strings):
  83. if no_leading_text:
  84. if children[0].strip():
  85. raise etree.ParserError('There is leading text: %r' %
  86. children[0])
  87. del children[0]
  88. return children
  89. def fragment_fromstring(html, create_parent=False,
  90. guess_charset=None, parser=None):
  91. """Parses a single HTML element; it is an error if there is more than
  92. one element, or if anything but whitespace precedes or follows the
  93. element.
  94. If 'create_parent' is true (or is a tag name) then a parent node
  95. will be created to encapsulate the HTML in a single element. In
  96. this case, leading or trailing text is allowed.
  97. If `guess_charset` is true, the `chardet` library will perform charset
  98. guessing on the string.
  99. """
  100. if not isinstance(html, _strings):
  101. raise TypeError('string required')
  102. accept_leading_text = bool(create_parent)
  103. elements = fragments_fromstring(
  104. html, guess_charset=guess_charset, parser=parser,
  105. no_leading_text=not accept_leading_text)
  106. if create_parent:
  107. if not isinstance(create_parent, _strings):
  108. create_parent = 'div'
  109. new_root = Element(create_parent)
  110. if elements:
  111. if isinstance(elements[0], _strings):
  112. new_root.text = elements[0]
  113. del elements[0]
  114. new_root.extend(elements)
  115. return new_root
  116. if not elements:
  117. raise etree.ParserError('No elements found')
  118. if len(elements) > 1:
  119. raise etree.ParserError('Multiple elements found')
  120. result = elements[0]
  121. if result.tail and result.tail.strip():
  122. raise etree.ParserError('Element followed by text: %r' % result.tail)
  123. result.tail = None
  124. return result
  125. def fromstring(html, guess_charset=None, parser=None):
  126. """Parse the html, returning a single element/document.
  127. This tries to minimally parse the chunk of text, without knowing if it
  128. is a fragment or a document.
  129. 'base_url' will set the document's base_url attribute (and the tree's
  130. docinfo.URL)
  131. If `guess_charset` is true, or if the input is not Unicode but a
  132. byte string, the `chardet` library will perform charset guessing
  133. on the string.
  134. """
  135. if not isinstance(html, _strings):
  136. raise TypeError('string required')
  137. doc = document_fromstring(html, parser=parser,
  138. guess_charset=guess_charset)
  139. # document starts with doctype or <html>, full document!
  140. start = html[:50]
  141. if isinstance(start, bytes):
  142. # Allow text comparison in python3.
  143. # Decode as ascii, that also covers latin-1 and utf-8 for the
  144. # characters we need.
  145. start = start.decode('ascii', 'replace')
  146. start = start.lstrip().lower()
  147. if start.startswith('<html') or start.startswith('<!doctype'):
  148. return doc
  149. head = _find_tag(doc, 'head')
  150. # if the head is not empty we have a full document
  151. if len(head):
  152. return doc
  153. body = _find_tag(doc, 'body')
  154. # The body has just one element, so it was probably a single
  155. # element passed in
  156. if (len(body) == 1 and (not body.text or not body.text.strip())
  157. and (not body[-1].tail or not body[-1].tail.strip())):
  158. return body[0]
  159. # Now we have a body which represents a bunch of tags which have the
  160. # content that was passed in. We will create a fake container, which
  161. # is the body tag, except <body> implies too much structure.
  162. if _contains_block_level_tag(body):
  163. body.tag = 'div'
  164. else:
  165. body.tag = 'span'
  166. return body
  167. def parse(filename_url_or_file, guess_charset=None, parser=None):
  168. """Parse a filename, URL, or file-like object into an HTML document
  169. tree. Note: this returns a tree, not an element. Use
  170. ``parse(...).getroot()`` to get the document root.
  171. If ``guess_charset`` is true, the ``useChardet`` option is passed into
  172. html5lib to enable character detection. This option is on by default
  173. when parsing from URLs, off by default when parsing from file(-like)
  174. objects (which tend to return Unicode more often than not), and on by
  175. default when parsing from a file path (which is read in binary mode).
  176. """
  177. if parser is None:
  178. parser = html_parser
  179. if not isinstance(filename_url_or_file, _strings):
  180. fp = filename_url_or_file
  181. if guess_charset is None:
  182. # assume that file-like objects return Unicode more often than bytes
  183. guess_charset = False
  184. elif _looks_like_url(filename_url_or_file):
  185. fp = urlopen(filename_url_or_file)
  186. if guess_charset is None:
  187. # assume that URLs return bytes
  188. guess_charset = True
  189. else:
  190. fp = open(filename_url_or_file, 'rb')
  191. if guess_charset is None:
  192. guess_charset = True
  193. options = {}
  194. # html5lib does not accept useChardet as an argument, if it
  195. # detected the html argument would produce unicode objects.
  196. if guess_charset:
  197. options['useChardet'] = guess_charset
  198. return parser.parse(fp, **options)
  199. def _looks_like_url(str):
  200. scheme = urlparse(str)[0]
  201. if not scheme:
  202. return False
  203. elif (sys.platform == 'win32' and
  204. scheme in string.ascii_letters
  205. and len(scheme) == 1):
  206. # looks like a 'normal' absolute path
  207. return False
  208. else:
  209. return True
# Default parser instance.  It is created here, after the class
# definition, and is looked up at call time by code that falls back to
# ``html_parser`` when no explicit parser is supplied.
html_parser = HTMLParser()