Du kannst nicht mehr als 25 Themen auswählen Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.
 
 
 
 

489 Zeilen
17 KiB

  1. """
  2. lxml-based doctest output comparison.
  3. Note: normally, you should just import the `lxml.usedoctest` and
  4. `lxml.html.usedoctest` modules from within a doctest, instead of this
  5. one::
  6. >>> import lxml.usedoctest # for XML output
  7. >>> import lxml.html.usedoctest # for HTML output
  8. To use this module directly, you must call ``lxmldoctest.install()``,
  9. which will cause doctest to use this in all subsequent calls.
  10. This changes the way output is checked and comparisons are made for
  11. XML or HTML-like content.
  12. XML or HTML content is noticed because the example starts with ``<``
  13. (it's HTML if it starts with ``<html``). You can also use the
  14. ``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing.
  15. Some rough wildcard-like things are allowed. Whitespace is generally
  16. ignored (except in attributes). In text (attributes and text in the
  17. body) you can use ``...`` as a wildcard. In an example it also
  18. matches any trailing tags in the element, though it does not match
  19. leading tags. You may create a tag ``<any>`` or include an ``any``
  20. attribute in the tag. An ``any`` tag matches any tag, while the
  21. attribute matches any and all attributes.
  22. When a match fails, the reformatted example and the actual output are
  23. displayed (indented), and a rough diff-like output is given. Anything
  24. marked with ``+`` is in the output but wasn't supposed to be, and
  25. similarly ``-`` means it's in the example but wasn't in the output.
  26. You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP``
  27. """
  28. from lxml import etree
  29. import sys
  30. import re
  31. import doctest
  32. try:
  33. from html import escape as html_escape
  34. except ImportError:
  35. from cgi import escape as html_escape
# Public names exported by ``from <this module> import *``.
__all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker',
           'LHTMLOutputChecker', 'install', 'temp_install']

# Custom doctest option flags, registered with doctest at import time so
# they can be used in ``# doctest: +FLAG`` directives.  PARSE_HTML and
# PARSE_XML force a parser; NOPARSE_MARKUP disables markup parsing.
PARSE_HTML = doctest.register_optionflag('PARSE_HTML')
PARSE_XML = doctest.register_optionflag('PARSE_XML')
NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP')

# Reference to doctest's checker class, bound once at import time; it is
# the base class of the checkers defined in this module.
OutputChecker = doctest.OutputChecker
  42. def strip(v):
  43. if v is None:
  44. return None
  45. else:
  46. return v.strip()
  47. def norm_whitespace(v):
  48. return _norm_whitespace_re.sub(' ', v)
  49. _html_parser = etree.HTMLParser(recover=False, remove_blank_text=True)
  50. def html_fromstring(html):
  51. return etree.fromstring(html, _html_parser)
# We use this to distinguish repr()s from elements: it matches text that
# starts with ``<``, followed by non-``>`` characters and then `` at `` or
# `` object `` (e.g. ``<Element a at 0x...>``).
_repr_re = re.compile(r'^<[^>]+ (at|object) ')
# Matches a run of two or more spaces, tabs or newlines.
_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')
  55. class LXMLOutputChecker(OutputChecker):
  56. empty_tags = (
  57. 'param', 'img', 'area', 'br', 'basefont', 'input',
  58. 'base', 'meta', 'link', 'col')
  59. def get_default_parser(self):
  60. return etree.XML
  61. def check_output(self, want, got, optionflags):
  62. alt_self = getattr(self, '_temp_override_self', None)
  63. if alt_self is not None:
  64. super_method = self._temp_call_super_check_output
  65. self = alt_self
  66. else:
  67. super_method = OutputChecker.check_output
  68. parser = self.get_parser(want, got, optionflags)
  69. if not parser:
  70. return super_method(
  71. self, want, got, optionflags)
  72. try:
  73. want_doc = parser(want)
  74. except etree.XMLSyntaxError:
  75. return False
  76. try:
  77. got_doc = parser(got)
  78. except etree.XMLSyntaxError:
  79. return False
  80. return self.compare_docs(want_doc, got_doc)
  81. def get_parser(self, want, got, optionflags):
  82. parser = None
  83. if NOPARSE_MARKUP & optionflags:
  84. return None
  85. if PARSE_HTML & optionflags:
  86. parser = html_fromstring
  87. elif PARSE_XML & optionflags:
  88. parser = etree.XML
  89. elif (want.strip().lower().startswith('<html')
  90. and got.strip().startswith('<html')):
  91. parser = html_fromstring
  92. elif (self._looks_like_markup(want)
  93. and self._looks_like_markup(got)):
  94. parser = self.get_default_parser()
  95. return parser
  96. def _looks_like_markup(self, s):
  97. s = s.strip()
  98. return (s.startswith('<')
  99. and not _repr_re.search(s))
  100. def compare_docs(self, want, got):
  101. if not self.tag_compare(want.tag, got.tag):
  102. return False
  103. if not self.text_compare(want.text, got.text, True):
  104. return False
  105. if not self.text_compare(want.tail, got.tail, True):
  106. return False
  107. if 'any' not in want.attrib:
  108. want_keys = sorted(want.attrib.keys())
  109. got_keys = sorted(got.attrib.keys())
  110. if want_keys != got_keys:
  111. return False
  112. for key in want_keys:
  113. if not self.text_compare(want.attrib[key], got.attrib[key], False):
  114. return False
  115. if want.text != '...' or len(want):
  116. want_children = list(want)
  117. got_children = list(got)
  118. while want_children or got_children:
  119. if not want_children or not got_children:
  120. return False
  121. want_first = want_children.pop(0)
  122. got_first = got_children.pop(0)
  123. if not self.compare_docs(want_first, got_first):
  124. return False
  125. if not got_children and want_first.tail == '...':
  126. break
  127. return True
  128. def text_compare(self, want, got, strip):
  129. want = want or ''
  130. got = got or ''
  131. if strip:
  132. want = norm_whitespace(want).strip()
  133. got = norm_whitespace(got).strip()
  134. want = '^%s$' % re.escape(want)
  135. want = want.replace(r'\.\.\.', '.*')
  136. if re.search(want, got):
  137. return True
  138. else:
  139. return False
  140. def tag_compare(self, want, got):
  141. if want == 'any':
  142. return True
  143. if (not isinstance(want, (str, bytes))
  144. or not isinstance(got, (str, bytes))):
  145. return want == got
  146. want = want or ''
  147. got = got or ''
  148. if want.startswith('{...}'):
  149. # Ellipsis on the namespace
  150. return want.split('}')[-1] == got.split('}')[-1]
  151. else:
  152. return want == got
  153. def output_difference(self, example, got, optionflags):
  154. want = example.want
  155. parser = self.get_parser(want, got, optionflags)
  156. errors = []
  157. if parser is not None:
  158. try:
  159. want_doc = parser(want)
  160. except etree.XMLSyntaxError:
  161. e = sys.exc_info()[1]
  162. errors.append('In example: %s' % e)
  163. try:
  164. got_doc = parser(got)
  165. except etree.XMLSyntaxError:
  166. e = sys.exc_info()[1]
  167. errors.append('In actual output: %s' % e)
  168. if parser is None or errors:
  169. value = OutputChecker.output_difference(
  170. self, example, got, optionflags)
  171. if errors:
  172. errors.append(value)
  173. return '\n'.join(errors)
  174. else:
  175. return value
  176. html = parser is html_fromstring
  177. diff_parts = ['Expected:',
  178. self.format_doc(want_doc, html, 2),
  179. 'Got:',
  180. self.format_doc(got_doc, html, 2),
  181. 'Diff:',
  182. self.collect_diff(want_doc, got_doc, html, 2)]
  183. return '\n'.join(diff_parts)
  184. def html_empty_tag(self, el, html=True):
  185. if not html:
  186. return False
  187. if el.tag not in self.empty_tags:
  188. return False
  189. if el.text or len(el):
  190. # This shouldn't happen (contents in an empty tag)
  191. return False
  192. return True
  193. def format_doc(self, doc, html, indent, prefix=''):
  194. parts = []
  195. if not len(doc):
  196. # No children...
  197. parts.append(' '*indent)
  198. parts.append(prefix)
  199. parts.append(self.format_tag(doc))
  200. if not self.html_empty_tag(doc, html):
  201. if strip(doc.text):
  202. parts.append(self.format_text(doc.text))
  203. parts.append(self.format_end_tag(doc))
  204. if strip(doc.tail):
  205. parts.append(self.format_text(doc.tail))
  206. parts.append('\n')
  207. return ''.join(parts)
  208. parts.append(' '*indent)
  209. parts.append(prefix)
  210. parts.append(self.format_tag(doc))
  211. if not self.html_empty_tag(doc, html):
  212. parts.append('\n')
  213. if strip(doc.text):
  214. parts.append(' '*indent)
  215. parts.append(self.format_text(doc.text))
  216. parts.append('\n')
  217. for el in doc:
  218. parts.append(self.format_doc(el, html, indent+2))
  219. parts.append(' '*indent)
  220. parts.append(self.format_end_tag(doc))
  221. parts.append('\n')
  222. if strip(doc.tail):
  223. parts.append(' '*indent)
  224. parts.append(self.format_text(doc.tail))
  225. parts.append('\n')
  226. return ''.join(parts)
  227. def format_text(self, text, strip=True):
  228. if text is None:
  229. return ''
  230. if strip:
  231. text = text.strip()
  232. return html_escape(text, 1)
  233. def format_tag(self, el):
  234. attrs = []
  235. if isinstance(el, etree.CommentBase):
  236. # FIXME: probably PIs should be handled specially too?
  237. return '<!--'
  238. for name, value in sorted(el.attrib.items()):
  239. attrs.append('%s="%s"' % (name, self.format_text(value, False)))
  240. if not attrs:
  241. return '<%s>' % el.tag
  242. return '<%s %s>' % (el.tag, ' '.join(attrs))
  243. def format_end_tag(self, el):
  244. if isinstance(el, etree.CommentBase):
  245. # FIXME: probably PIs should be handled specially too?
  246. return '-->'
  247. return '</%s>' % el.tag
  248. def collect_diff(self, want, got, html, indent):
  249. parts = []
  250. if not len(want) and not len(got):
  251. parts.append(' '*indent)
  252. parts.append(self.collect_diff_tag(want, got))
  253. if not self.html_empty_tag(got, html):
  254. parts.append(self.collect_diff_text(want.text, got.text))
  255. parts.append(self.collect_diff_end_tag(want, got))
  256. parts.append(self.collect_diff_text(want.tail, got.tail))
  257. parts.append('\n')
  258. return ''.join(parts)
  259. parts.append(' '*indent)
  260. parts.append(self.collect_diff_tag(want, got))
  261. parts.append('\n')
  262. if strip(want.text) or strip(got.text):
  263. parts.append(' '*indent)
  264. parts.append(self.collect_diff_text(want.text, got.text))
  265. parts.append('\n')
  266. want_children = list(want)
  267. got_children = list(got)
  268. while want_children or got_children:
  269. if not want_children:
  270. parts.append(self.format_doc(got_children.pop(0), html, indent+2, '+'))
  271. continue
  272. if not got_children:
  273. parts.append(self.format_doc(want_children.pop(0), html, indent+2, '-'))
  274. continue
  275. parts.append(self.collect_diff(
  276. want_children.pop(0), got_children.pop(0), html, indent+2))
  277. parts.append(' '*indent)
  278. parts.append(self.collect_diff_end_tag(want, got))
  279. parts.append('\n')
  280. if strip(want.tail) or strip(got.tail):
  281. parts.append(' '*indent)
  282. parts.append(self.collect_diff_text(want.tail, got.tail))
  283. parts.append('\n')
  284. return ''.join(parts)
  285. def collect_diff_tag(self, want, got):
  286. if not self.tag_compare(want.tag, got.tag):
  287. tag = '%s (got: %s)' % (want.tag, got.tag)
  288. else:
  289. tag = got.tag
  290. attrs = []
  291. any = want.tag == 'any' or 'any' in want.attrib
  292. for name, value in sorted(got.attrib.items()):
  293. if name not in want.attrib and not any:
  294. attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
  295. else:
  296. if name in want.attrib:
  297. text = self.collect_diff_text(want.attrib[name], value, False)
  298. else:
  299. text = self.format_text(value, False)
  300. attrs.append('%s="%s"' % (name, text))
  301. if not any:
  302. for name, value in sorted(want.attrib.items()):
  303. if name in got.attrib:
  304. continue
  305. attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
  306. if attrs:
  307. tag = '<%s %s>' % (tag, ' '.join(attrs))
  308. else:
  309. tag = '<%s>' % tag
  310. return tag
  311. def collect_diff_end_tag(self, want, got):
  312. if want.tag != got.tag:
  313. tag = '%s (got: %s)' % (want.tag, got.tag)
  314. else:
  315. tag = got.tag
  316. return '</%s>' % tag
  317. def collect_diff_text(self, want, got, strip=True):
  318. if self.text_compare(want, got, strip):
  319. if not got:
  320. return ''
  321. return self.format_text(got, strip)
  322. text = '%s (got: %s)' % (want, got)
  323. return self.format_text(text, strip)
  324. class LHTMLOutputChecker(LXMLOutputChecker):
  325. def get_default_parser(self):
  326. return html_fromstring
  327. def install(html=False):
  328. """
  329. Install doctestcompare for all future doctests.
  330. If html is true, then by default the HTML parser will be used;
  331. otherwise the XML parser is used.
  332. """
  333. if html:
  334. doctest.OutputChecker = LHTMLOutputChecker
  335. else:
  336. doctest.OutputChecker = LXMLOutputChecker
def temp_install(html=False, del_module=None):
    """
    Use this *inside* a doctest to enable this checker for this
    doctest only.

    If html is true, then by default the HTML parser will be used;
    otherwise the XML parser is used.

    ``del_module`` is handed to ``_RestoreChecker``; if set, that module
    name is removed from ``sys.modules`` when the checker is restored.
    """
    if html:
        Checker = LHTMLOutputChecker
    else:
        Checker = LXMLOutputChecker
    # Locate the running doctest frame and the runner object it belongs to.
    frame = _find_doctest_frame()
    dt_self = frame.f_locals['self']
    checker = Checker()
    old_checker = dt_self._checker
    dt_self._checker = checker

    # The unfortunate thing is that there is a local variable 'check'
    # in the function that runs the doctests, that is a bound method
    # into the output checker.  We have to update that.  We can't
    # modify the frame, so we have to modify the object in place.  The
    # only way to do this is to actually change the __code__
    # attribute of the method.  We change it, and then wait for
    # __record_outcome to be run, which signals the end of the __run
    # method, at which point we restore the previous check_output
    # implementation.
    check_func = frame.f_locals['check'].__func__
    checker_check_func = checker.check_output.__func__
    # Because we can't patch up __globals__, this is the only global
    # in check_output that we care about:
    doctest.etree = etree
    # The constructor performs the patching; the instance stays reachable
    # through the attributes it installs on the runner and the old checker.
    _RestoreChecker(dt_self, old_checker, checker,
                    check_func, checker_check_func,
                    del_module)
class _RestoreChecker:
    """
    Install a temporary output checker and undo the change later.

    On construction, the code of ``check_func`` is replaced with the code of
    ``clone_func`` and the runner's ``__record_outcome`` is replaced with this
    object.  When the runner then calls it, everything is restored.
    """

    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
                 del_module):
        self.dt_self = dt_self
        self.checker = old_checker
        # Hooks read by the transplanted check_output code, which runs with
        # the old checker as ``self``.
        self.checker._temp_call_super_check_output = self.call_super
        self.checker._temp_override_self = new_checker
        self.check_func = check_func
        self.clone_func = clone_func
        self.del_module = del_module
        self.install_clone()
        self.install_dt_self()

    def install_clone(self):
        """Save the code of ``check_func`` and swap in the code of ``clone_func``."""
        self.func_code = self.check_func.__code__
        self.func_globals = self.check_func.__globals__
        self.check_func.__code__ = self.clone_func.__code__

    def uninstall_clone(self):
        """Put the saved code object back on ``check_func``."""
        self.check_func.__code__ = self.func_code

    def install_dt_self(self):
        """Replace the runner's ``__record_outcome`` with this object, saving the old one."""
        self.prev_func = self.dt_self._DocTestRunner__record_outcome
        self.dt_self._DocTestRunner__record_outcome = self

    def uninstall_dt_self(self):
        """Restore the runner's saved ``__record_outcome``."""
        self.dt_self._DocTestRunner__record_outcome = self.prev_func

    def uninstall_module(self):
        """Remove ``del_module`` (if set) from ``sys.modules`` and from its parent package."""
        if self.del_module:
            import sys
            del sys.modules[self.del_module]
            if '.' in self.del_module:
                package, module = self.del_module.rsplit('.', 1)
                package_mod = sys.modules[package]
                delattr(package_mod, module)

    def __call__(self, *args, **kw):
        """Undo all patches, then delegate to the saved ``__record_outcome``."""
        self.uninstall_clone()
        self.uninstall_dt_self()
        del self.checker._temp_override_self
        del self.checker._temp_call_super_check_output
        result = self.prev_func(*args, **kw)
        self.uninstall_module()
        return result

    def call_super(self, *args, **kw):
        """Call the original ``check_func`` code, re-installing the clone afterwards."""
        self.uninstall_clone()
        try:
            return self.check_func(*args, **kw)
        finally:
            self.install_clone()
  415. def _find_doctest_frame():
  416. import sys
  417. frame = sys._getframe(1)
  418. while frame:
  419. l = frame.f_locals
  420. if 'BOOM' in l:
  421. # Sign of doctest
  422. return frame
  423. frame = frame.f_back
  424. raise LookupError(
  425. "Could not find doctest (only use this function *inside* a doctest)")
  426. __test__ = {
  427. 'basic': '''
  428. >>> temp_install()
  429. >>> print """<xml a="1" b="2">stuff</xml>"""
  430. <xml b="2" a="1">...</xml>
  431. >>> print """<xml xmlns="http://example.com"><tag attr="bar" /></xml>"""
  432. <xml xmlns="...">
  433. <tag attr="..." />
  434. </xml>
  435. >>> print """<xml>blahblahblah<foo /></xml>""" # doctest: +NOPARSE_MARKUP, +ELLIPSIS
  436. <xml>...foo /></xml>
  437. '''}
  438. if __name__ == '__main__':
  439. import doctest
  440. doctest.testmod()