You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

885 lines
30 KiB

  1. # cython: language_level=3
  2. from __future__ import absolute_import
  3. import difflib
  4. from lxml import etree
  5. from lxml.html import fragment_fromstring
  6. import re
  7. __all__ = ['html_annotate', 'htmldiff']
  8. try:
  9. from html import escape as html_escape
  10. except ImportError:
  11. from cgi import escape as html_escape
  12. try:
  13. _unicode = unicode
  14. except NameError:
  15. # Python 3
  16. _unicode = str
  17. try:
  18. basestring
  19. except NameError:
  20. # Python 3
  21. basestring = str
  22. ############################################################
  23. ## Annotation
  24. ############################################################
  25. def default_markup(text, version):
  26. return '<span title="%s">%s</span>' % (
  27. html_escape(_unicode(version), 1), text)
  28. def html_annotate(doclist, markup=default_markup):
  29. """
  30. doclist should be ordered from oldest to newest, like::
  31. >>> version1 = 'Hello World'
  32. >>> version2 = 'Goodbye World'
  33. >>> print(html_annotate([(version1, 'version 1'),
  34. ... (version2, 'version 2')]))
  35. <span title="version 2">Goodbye</span> <span title="version 1">World</span>
  36. The documents must be *fragments* (str/UTF8 or unicode), not
  37. complete documents
  38. The markup argument is a function to markup the spans of words.
  39. This function is called like markup('Hello', 'version 2'), and
  40. returns HTML. The first argument is text and never includes any
  41. markup. The default uses a span with a title:
  42. >>> print(default_markup('Some Text', 'by Joe'))
  43. <span title="by Joe">Some Text</span>
  44. """
  45. # The basic strategy we have is to split the documents up into
  46. # logical tokens (which are words with attached markup). We then
  47. # do diffs of each of the versions to track when a token first
  48. # appeared in the document; the annotation attached to the token
  49. # is the version where it first appeared.
  50. tokenlist = [tokenize_annotated(doc, version)
  51. for doc, version in doclist]
  52. cur_tokens = tokenlist[0]
  53. for tokens in tokenlist[1:]:
  54. html_annotate_merge_annotations(cur_tokens, tokens)
  55. cur_tokens = tokens
  56. # After we've tracked all the tokens, we can combine spans of text
  57. # that are adjacent and have the same annotation
  58. cur_tokens = compress_tokens(cur_tokens)
  59. # And finally add markup
  60. result = markup_serialize_tokens(cur_tokens, markup)
  61. return ''.join(result).strip()
  62. def tokenize_annotated(doc, annotation):
  63. """Tokenize a document and add an annotation attribute to each token
  64. """
  65. tokens = tokenize(doc, include_hrefs=False)
  66. for tok in tokens:
  67. tok.annotation = annotation
  68. return tokens
  69. def html_annotate_merge_annotations(tokens_old, tokens_new):
  70. """Merge the annotations from tokens_old into tokens_new, when the
  71. tokens in the new document already existed in the old document.
  72. """
  73. s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
  74. commands = s.get_opcodes()
  75. for command, i1, i2, j1, j2 in commands:
  76. if command == 'equal':
  77. eq_old = tokens_old[i1:i2]
  78. eq_new = tokens_new[j1:j2]
  79. copy_annotations(eq_old, eq_new)
  80. def copy_annotations(src, dest):
  81. """
  82. Copy annotations from the tokens listed in src to the tokens in dest
  83. """
  84. assert len(src) == len(dest)
  85. for src_tok, dest_tok in zip(src, dest):
  86. dest_tok.annotation = src_tok.annotation
  87. def compress_tokens(tokens):
  88. """
  89. Combine adjacent tokens when there is no HTML between the tokens,
  90. and they share an annotation
  91. """
  92. result = [tokens[0]]
  93. for tok in tokens[1:]:
  94. if (not result[-1].post_tags and
  95. not tok.pre_tags and
  96. result[-1].annotation == tok.annotation):
  97. compress_merge_back(result, tok)
  98. else:
  99. result.append(tok)
  100. return result
  101. def compress_merge_back(tokens, tok):
  102. """ Merge tok into the last element of tokens (modifying the list of
  103. tokens in-place). """
  104. last = tokens[-1]
  105. if type(last) is not token or type(tok) is not token:
  106. tokens.append(tok)
  107. else:
  108. text = _unicode(last)
  109. if last.trailing_whitespace:
  110. text += last.trailing_whitespace
  111. text += tok
  112. merged = token(text,
  113. pre_tags=last.pre_tags,
  114. post_tags=tok.post_tags,
  115. trailing_whitespace=tok.trailing_whitespace)
  116. merged.annotation = last.annotation
  117. tokens[-1] = merged
  118. def markup_serialize_tokens(tokens, markup_func):
  119. """
  120. Serialize the list of tokens into a list of text chunks, calling
  121. markup_func around text to add annotations.
  122. """
  123. for token in tokens:
  124. for pre in token.pre_tags:
  125. yield pre
  126. html = token.html()
  127. html = markup_func(html, token.annotation)
  128. if token.trailing_whitespace:
  129. html += token.trailing_whitespace
  130. yield html
  131. for post in token.post_tags:
  132. yield post
  133. ############################################################
  134. ## HTML Diffs
  135. ############################################################
  136. def htmldiff(old_html, new_html):
  137. ## FIXME: this should take parsed documents too, and use their body
  138. ## or other content.
  139. """ Do a diff of the old and new document. The documents are HTML
  140. *fragments* (str/UTF8 or unicode), they are not complete documents
  141. (i.e., no <html> tag).
  142. Returns HTML with <ins> and <del> tags added around the
  143. appropriate text.
  144. Markup is generally ignored, with the markup from new_html
  145. preserved, and possibly some markup from old_html (though it is
  146. considered acceptable to lose some of the old markup). Only the
  147. words in the HTML are diffed. The exception is <img> tags, which
  148. are treated like words, and the href attribute of <a> tags, which
  149. are noted inside the tag itself when there are changes.
  150. """
  151. old_html_tokens = tokenize(old_html)
  152. new_html_tokens = tokenize(new_html)
  153. result = htmldiff_tokens(old_html_tokens, new_html_tokens)
  154. result = ''.join(result).strip()
  155. return fixup_ins_del_tags(result)
  156. def htmldiff_tokens(html1_tokens, html2_tokens):
  157. """ Does a diff on the tokens themselves, returning a list of text
  158. chunks (not tokens).
  159. """
  160. # There are several passes as we do the differences. The tokens
  161. # isolate the portion of the content we care to diff; difflib does
  162. # all the actual hard work at that point.
  163. #
  164. # Then we must create a valid document from pieces of both the old
  165. # document and the new document. We generally prefer to take
  166. # markup from the new document, and only do a best effort attempt
  167. # to keep markup from the old document; anything that we can't
  168. # resolve we throw away. Also we try to put the deletes as close
  169. # to the location where we think they would have been -- because
  170. # we are only keeping the markup from the new document, it can be
  171. # fuzzy where in the new document the old text would have gone.
  172. # Again we just do a best effort attempt.
  173. s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
  174. commands = s.get_opcodes()
  175. result = []
  176. for command, i1, i2, j1, j2 in commands:
  177. if command == 'equal':
  178. result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
  179. continue
  180. if command == 'insert' or command == 'replace':
  181. ins_tokens = expand_tokens(html2_tokens[j1:j2])
  182. merge_insert(ins_tokens, result)
  183. if command == 'delete' or command == 'replace':
  184. del_tokens = expand_tokens(html1_tokens[i1:i2])
  185. merge_delete(del_tokens, result)
  186. # If deletes were inserted directly as <del> then we'd have an
  187. # invalid document at this point. Instead we put in special
  188. # markers, and when the complete diffed document has been created
  189. # we try to move the deletes around and resolve any problems.
  190. result = cleanup_delete(result)
  191. return result
  192. def expand_tokens(tokens, equal=False):
  193. """Given a list of tokens, return a generator of the chunks of
  194. text for the data in the tokens.
  195. """
  196. for token in tokens:
  197. for pre in token.pre_tags:
  198. yield pre
  199. if not equal or not token.hide_when_equal:
  200. if token.trailing_whitespace:
  201. yield token.html() + token.trailing_whitespace
  202. else:
  203. yield token.html()
  204. for post in token.post_tags:
  205. yield post
  206. def merge_insert(ins_chunks, doc):
  207. """ doc is the already-handled document (as a list of text chunks);
  208. here we add <ins>ins_chunks</ins> to the end of that. """
  209. # Though we don't throw away unbalanced_start or unbalanced_end
  210. # (we assume there is accompanying markup later or earlier in the
  211. # document), we only put <ins> around the balanced portion.
  212. unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
  213. doc.extend(unbalanced_start)
  214. if doc and not doc[-1].endswith(' '):
  215. # Fix up the case where the word before the insert didn't end with
  216. # a space
  217. doc[-1] += ' '
  218. doc.append('<ins>')
  219. if balanced and balanced[-1].endswith(' '):
  220. # We move space outside of </ins>
  221. balanced[-1] = balanced[-1][:-1]
  222. doc.extend(balanced)
  223. doc.append('</ins> ')
  224. doc.extend(unbalanced_end)
  225. # These are sentinals to represent the start and end of a <del>
  226. # segment, until we do the cleanup phase to turn them into proper
  227. # markup:
  228. class DEL_START:
  229. pass
  230. class DEL_END:
  231. pass
  232. class NoDeletes(Exception):
  233. """ Raised when the document no longer contains any pending deletes
  234. (DEL_START/DEL_END) """
  235. def merge_delete(del_chunks, doc):
  236. """ Adds the text chunks in del_chunks to the document doc (another
  237. list of text chunks) with marker to show it is a delete.
  238. cleanup_delete later resolves these markers into <del> tags."""
  239. doc.append(DEL_START)
  240. doc.extend(del_chunks)
  241. doc.append(DEL_END)
  242. def cleanup_delete(chunks):
  243. """ Cleans up any DEL_START/DEL_END markers in the document, replacing
  244. them with <del></del>. To do this while keeping the document
  245. valid, it may need to drop some tags (either start or end tags).
  246. It may also move the del into adjacent tags to try to move it to a
  247. similar location where it was originally located (e.g., moving a
  248. delete into preceding <div> tag, if the del looks like (DEL_START,
  249. 'Text</div>', DEL_END)"""
  250. while 1:
  251. # Find a pending DEL_START/DEL_END, splitting the document
  252. # into stuff-preceding-DEL_START, stuff-inside, and
  253. # stuff-following-DEL_END
  254. try:
  255. pre_delete, delete, post_delete = split_delete(chunks)
  256. except NoDeletes:
  257. # Nothing found, we've cleaned up the entire doc
  258. break
  259. # The stuff-inside-DEL_START/END may not be well balanced
  260. # markup. First we figure out what unbalanced portions there are:
  261. unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
  262. # Then we move the span forward and/or backward based on these
  263. # unbalanced portions:
  264. locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
  265. locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
  266. doc = pre_delete
  267. if doc and not doc[-1].endswith(' '):
  268. # Fix up case where the word before us didn't have a trailing space
  269. doc[-1] += ' '
  270. doc.append('<del>')
  271. if balanced and balanced[-1].endswith(' '):
  272. # We move space outside of </del>
  273. balanced[-1] = balanced[-1][:-1]
  274. doc.extend(balanced)
  275. doc.append('</del> ')
  276. doc.extend(post_delete)
  277. chunks = doc
  278. return chunks
  279. def split_unbalanced(chunks):
  280. """Return (unbalanced_start, balanced, unbalanced_end), where each is
  281. a list of text and tag chunks.
  282. unbalanced_start is a list of all the tags that are opened, but
  283. not closed in this span. Similarly, unbalanced_end is a list of
  284. tags that are closed but were not opened. Extracting these might
  285. mean some reordering of the chunks."""
  286. start = []
  287. end = []
  288. tag_stack = []
  289. balanced = []
  290. for chunk in chunks:
  291. if not chunk.startswith('<'):
  292. balanced.append(chunk)
  293. continue
  294. endtag = chunk[1] == '/'
  295. name = chunk.split()[0].strip('<>/')
  296. if name in empty_tags:
  297. balanced.append(chunk)
  298. continue
  299. if endtag:
  300. if tag_stack and tag_stack[-1][0] == name:
  301. balanced.append(chunk)
  302. name, pos, tag = tag_stack.pop()
  303. balanced[pos] = tag
  304. elif tag_stack:
  305. start.extend([tag for name, pos, tag in tag_stack])
  306. tag_stack = []
  307. end.append(chunk)
  308. else:
  309. end.append(chunk)
  310. else:
  311. tag_stack.append((name, len(balanced), chunk))
  312. balanced.append(None)
  313. start.extend(
  314. [chunk for name, pos, chunk in tag_stack])
  315. balanced = [chunk for chunk in balanced if chunk is not None]
  316. return start, balanced, end
  317. def split_delete(chunks):
  318. """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
  319. stuff_after_DEL_END). Returns the first case found (there may be
  320. more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if
  321. there's no DEL_START found. """
  322. try:
  323. pos = chunks.index(DEL_START)
  324. except ValueError:
  325. raise NoDeletes
  326. pos2 = chunks.index(DEL_END)
  327. return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
  328. def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
  329. """ pre_delete and post_delete implicitly point to a place in the
  330. document (where the two were split). This moves that point (by
  331. popping items from one and pushing them onto the other). It moves
  332. the point to try to find a place where unbalanced_start applies.
  333. As an example::
  334. >>> unbalanced_start = ['<div>']
  335. >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
  336. >>> pre, post = doc[:3], doc[3:]
  337. >>> pre, post
  338. (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
  339. >>> locate_unbalanced_start(unbalanced_start, pre, post)
  340. >>> pre, post
  341. (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
  342. As you can see, we moved the point so that the dangling <div> that
  343. we found will be effectively replaced by the div in the original
  344. document. If this doesn't work out, we just throw away
  345. unbalanced_start without doing anything.
  346. """
  347. while 1:
  348. if not unbalanced_start:
  349. # We have totally succeeded in finding the position
  350. break
  351. finding = unbalanced_start[0]
  352. finding_name = finding.split()[0].strip('<>')
  353. if not post_delete:
  354. break
  355. next = post_delete[0]
  356. if next is DEL_START or not next.startswith('<'):
  357. # Reached a word, we can't move the delete text forward
  358. break
  359. if next[1] == '/':
  360. # Reached a closing tag, can we go further? Maybe not...
  361. break
  362. name = next.split()[0].strip('<>')
  363. if name == 'ins':
  364. # Can't move into an insert
  365. break
  366. assert name != 'del', (
  367. "Unexpected delete tag: %r" % next)
  368. if name == finding_name:
  369. unbalanced_start.pop(0)
  370. pre_delete.append(post_delete.pop(0))
  371. else:
  372. # Found a tag that doesn't match
  373. break
  374. def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
  375. """ like locate_unbalanced_start, except handling end tags and
  376. possibly moving the point earlier in the document. """
  377. while 1:
  378. if not unbalanced_end:
  379. # Success
  380. break
  381. finding = unbalanced_end[-1]
  382. finding_name = finding.split()[0].strip('<>/')
  383. if not pre_delete:
  384. break
  385. next = pre_delete[-1]
  386. if next is DEL_END or not next.startswith('</'):
  387. # A word or a start tag
  388. break
  389. name = next.split()[0].strip('<>/')
  390. if name == 'ins' or name == 'del':
  391. # Can't move into an insert or delete
  392. break
  393. if name == finding_name:
  394. unbalanced_end.pop()
  395. post_delete.insert(0, pre_delete.pop())
  396. else:
  397. # Found a tag that doesn't match
  398. break
  399. class token(_unicode):
  400. """ Represents a diffable token, generally a word that is displayed to
  401. the user. Opening tags are attached to this token when they are
  402. adjacent (pre_tags) and closing tags that follow the word
  403. (post_tags). Some exceptions occur when there are empty tags
  404. adjacent to a word, so there may be close tags in pre_tags, or
  405. open tags in post_tags.
  406. We also keep track of whether the word was originally followed by
  407. whitespace, even though we do not want to treat the word as
  408. equivalent to a similar word that does not have a trailing
  409. space."""
  410. # When this is true, the token will be eliminated from the
  411. # displayed diff if no change has occurred:
  412. hide_when_equal = False
  413. def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
  414. obj = _unicode.__new__(cls, text)
  415. if pre_tags is not None:
  416. obj.pre_tags = pre_tags
  417. else:
  418. obj.pre_tags = []
  419. if post_tags is not None:
  420. obj.post_tags = post_tags
  421. else:
  422. obj.post_tags = []
  423. obj.trailing_whitespace = trailing_whitespace
  424. return obj
  425. def __repr__(self):
  426. return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
  427. self.post_tags, self.trailing_whitespace)
  428. def html(self):
  429. return _unicode(self)
  430. class tag_token(token):
  431. """ Represents a token that is actually a tag. Currently this is just
  432. the <img> tag, which takes up visible space just like a word but
  433. is only represented in a document by a tag. """
  434. def __new__(cls, tag, data, html_repr, pre_tags=None,
  435. post_tags=None, trailing_whitespace=""):
  436. obj = token.__new__(cls, "%s: %s" % (type, data),
  437. pre_tags=pre_tags,
  438. post_tags=post_tags,
  439. trailing_whitespace=trailing_whitespace)
  440. obj.tag = tag
  441. obj.data = data
  442. obj.html_repr = html_repr
  443. return obj
  444. def __repr__(self):
  445. return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
  446. self.tag,
  447. self.data,
  448. self.html_repr,
  449. self.pre_tags,
  450. self.post_tags,
  451. self.trailing_whitespace)
  452. def html(self):
  453. return self.html_repr
  454. class href_token(token):
  455. """ Represents the href in an anchor tag. Unlike other words, we only
  456. show the href when it changes. """
  457. hide_when_equal = True
  458. def html(self):
  459. return ' Link: %s' % self
  460. def tokenize(html, include_hrefs=True):
  461. """
  462. Parse the given HTML and returns token objects (words with attached tags).
  463. This parses only the content of a page; anything in the head is
  464. ignored, and the <head> and <body> elements are themselves
  465. optional. The content is then parsed by lxml, which ensures the
  466. validity of the resulting parsed document (though lxml may make
  467. incorrect guesses when the markup is particular bad).
  468. <ins> and <del> tags are also eliminated from the document, as
  469. that gets confusing.
  470. If include_hrefs is true, then the href attribute of <a> tags is
  471. included as a special kind of diffable token."""
  472. if etree.iselement(html):
  473. body_el = html
  474. else:
  475. body_el = parse_html(html, cleanup=True)
  476. # Then we split the document into text chunks for each tag, word, and end tag:
  477. chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
  478. # Finally re-joining them into token objects:
  479. return fixup_chunks(chunks)
  480. def parse_html(html, cleanup=True):
  481. """
  482. Parses an HTML fragment, returning an lxml element. Note that the HTML will be
  483. wrapped in a <div> tag that was not in the original document.
  484. If cleanup is true, make sure there's no <head> or <body>, and get
  485. rid of any <ins> and <del> tags.
  486. """
  487. if cleanup:
  488. # This removes any extra markup or structure like <head>:
  489. html = cleanup_html(html)
  490. return fragment_fromstring(html, create_parent=True)
  491. _body_re = re.compile(r'<body.*?>', re.I|re.S)
  492. _end_body_re = re.compile(r'</body.*?>', re.I|re.S)
  493. _ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
  494. def cleanup_html(html):
  495. """ This 'cleans' the HTML, meaning that any page structure is removed
  496. (only the contents of <body> are used, if there is any <body).
  497. Also <ins> and <del> tags are removed. """
  498. match = _body_re.search(html)
  499. if match:
  500. html = html[match.end():]
  501. match = _end_body_re.search(html)
  502. if match:
  503. html = html[:match.start()]
  504. html = _ins_del_re.sub('', html)
  505. return html
  506. end_whitespace_re = re.compile(r'[ \t\n\r]$')
  507. def split_trailing_whitespace(word):
  508. """
  509. This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
  510. """
  511. stripped_length = len(word.rstrip())
  512. return word[0:stripped_length], word[stripped_length:]
  513. def fixup_chunks(chunks):
  514. """
  515. This function takes a list of chunks and produces a list of tokens.
  516. """
  517. tag_accum = []
  518. cur_word = None
  519. result = []
  520. for chunk in chunks:
  521. if isinstance(chunk, tuple):
  522. if chunk[0] == 'img':
  523. src = chunk[1]
  524. tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
  525. cur_word = tag_token('img', src, html_repr=tag,
  526. pre_tags=tag_accum,
  527. trailing_whitespace=trailing_whitespace)
  528. tag_accum = []
  529. result.append(cur_word)
  530. elif chunk[0] == 'href':
  531. href = chunk[1]
  532. cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
  533. tag_accum = []
  534. result.append(cur_word)
  535. continue
  536. if is_word(chunk):
  537. chunk, trailing_whitespace = split_trailing_whitespace(chunk)
  538. cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
  539. tag_accum = []
  540. result.append(cur_word)
  541. elif is_start_tag(chunk):
  542. tag_accum.append(chunk)
  543. elif is_end_tag(chunk):
  544. if tag_accum:
  545. tag_accum.append(chunk)
  546. else:
  547. assert cur_word, (
  548. "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
  549. % (cur_word, result, chunk, chunks))
  550. cur_word.post_tags.append(chunk)
  551. else:
  552. assert False
  553. if not result:
  554. return [token('', pre_tags=tag_accum)]
  555. else:
  556. result[-1].post_tags.extend(tag_accum)
  557. return result
  558. # All the tags in HTML that don't require end tags:
  559. empty_tags = (
  560. 'param', 'img', 'area', 'br', 'basefont', 'input',
  561. 'base', 'meta', 'link', 'col')
  562. block_level_tags = (
  563. 'address',
  564. 'blockquote',
  565. 'center',
  566. 'dir',
  567. 'div',
  568. 'dl',
  569. 'fieldset',
  570. 'form',
  571. 'h1',
  572. 'h2',
  573. 'h3',
  574. 'h4',
  575. 'h5',
  576. 'h6',
  577. 'hr',
  578. 'isindex',
  579. 'menu',
  580. 'noframes',
  581. 'noscript',
  582. 'ol',
  583. 'p',
  584. 'pre',
  585. 'table',
  586. 'ul',
  587. )
  588. block_level_container_tags = (
  589. 'dd',
  590. 'dt',
  591. 'frameset',
  592. 'li',
  593. 'tbody',
  594. 'td',
  595. 'tfoot',
  596. 'th',
  597. 'thead',
  598. 'tr',
  599. )
  600. def flatten_el(el, include_hrefs, skip_tag=False):
  601. """ Takes an lxml element el, and generates all the text chunks for
  602. that tag. Each start tag is a chunk, each word is a chunk, and each
  603. end tag is a chunk.
  604. If skip_tag is true, then the outermost container tag is
  605. not returned (just its contents)."""
  606. if not skip_tag:
  607. if el.tag == 'img':
  608. yield ('img', el.get('src'), start_tag(el))
  609. else:
  610. yield start_tag(el)
  611. if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
  612. return
  613. start_words = split_words(el.text)
  614. for word in start_words:
  615. yield html_escape(word)
  616. for child in el:
  617. for item in flatten_el(child, include_hrefs=include_hrefs):
  618. yield item
  619. if el.tag == 'a' and el.get('href') and include_hrefs:
  620. yield ('href', el.get('href'))
  621. if not skip_tag:
  622. yield end_tag(el)
  623. end_words = split_words(el.tail)
  624. for word in end_words:
  625. yield html_escape(word)
  626. split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)
  627. def split_words(text):
  628. """ Splits some text into words. Includes trailing whitespace
  629. on each word when appropriate. """
  630. if not text or not text.strip():
  631. return []
  632. words = split_words_re.findall(text)
  633. return words
  634. start_whitespace_re = re.compile(r'^[ \t\n\r]')
  635. def start_tag(el):
  636. """
  637. The text representation of the start tag for a tag.
  638. """
  639. return '<%s%s>' % (
  640. el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
  641. for name, value in el.attrib.items()]))
  642. def end_tag(el):
  643. """ The text representation of an end tag for a tag. Includes
  644. trailing whitespace when appropriate. """
  645. if el.tail and start_whitespace_re.search(el.tail):
  646. extra = ' '
  647. else:
  648. extra = ''
  649. return '</%s>%s' % (el.tag, extra)
  650. def is_word(tok):
  651. return not tok.startswith('<')
  652. def is_end_tag(tok):
  653. return tok.startswith('</')
  654. def is_start_tag(tok):
  655. return tok.startswith('<') and not tok.startswith('</')
  656. def fixup_ins_del_tags(html):
  657. """ Given an html string, move any <ins> or <del> tags inside of any
  658. block-level elements, e.g. transform <ins><p>word</p></ins> to
  659. <p><ins>word</ins></p> """
  660. doc = parse_html(html, cleanup=False)
  661. _fixup_ins_del_tags(doc)
  662. html = serialize_html_fragment(doc, skip_outer=True)
  663. return html
  664. def serialize_html_fragment(el, skip_outer=False):
  665. """ Serialize a single lxml element as HTML. The serialized form
  666. includes the elements tail.
  667. If skip_outer is true, then don't serialize the outermost tag
  668. """
  669. assert not isinstance(el, basestring), (
  670. "You should pass in an element, not a string like %r" % el)
  671. html = etree.tostring(el, method="html", encoding=_unicode)
  672. if skip_outer:
  673. # Get rid of the extra starting tag:
  674. html = html[html.find('>')+1:]
  675. # Get rid of the extra end tag:
  676. html = html[:html.rfind('<')]
  677. return html.strip()
  678. else:
  679. return html
  680. def _fixup_ins_del_tags(doc):
  681. """fixup_ins_del_tags that works on an lxml document in-place
  682. """
  683. for tag in ['ins', 'del']:
  684. for el in doc.xpath('descendant-or-self::%s' % tag):
  685. if not _contains_block_level_tag(el):
  686. continue
  687. _move_el_inside_block(el, tag=tag)
  688. el.drop_tag()
  689. #_merge_element_contents(el)
  690. def _contains_block_level_tag(el):
  691. """True if the element contains any block-level elements, like <p>, <td>, etc.
  692. """
  693. if el.tag in block_level_tags or el.tag in block_level_container_tags:
  694. return True
  695. for child in el:
  696. if _contains_block_level_tag(child):
  697. return True
  698. return False
  699. def _move_el_inside_block(el, tag):
  700. """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
  701. and moves them inside any block-level tags. """
  702. for child in el:
  703. if _contains_block_level_tag(child):
  704. break
  705. else:
  706. # No block-level tags in any child
  707. children_tag = etree.Element(tag)
  708. children_tag.text = el.text
  709. el.text = None
  710. children_tag.extend(list(el))
  711. el[:] = [children_tag]
  712. return
  713. for child in list(el):
  714. if _contains_block_level_tag(child):
  715. _move_el_inside_block(child, tag)
  716. if child.tail:
  717. tail_tag = etree.Element(tag)
  718. tail_tag.text = child.tail
  719. child.tail = None
  720. el.insert(el.index(child)+1, tail_tag)
  721. else:
  722. child_tag = etree.Element(tag)
  723. el.replace(child, child_tag)
  724. child_tag.append(child)
  725. if el.text:
  726. text_tag = etree.Element(tag)
  727. text_tag.text = el.text
  728. el.text = None
  729. el.insert(0, text_tag)
  730. def _merge_element_contents(el):
  731. """
  732. Removes an element, but merges its contents into its place, e.g.,
  733. given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
  734. <p>Hi there!</p>
  735. """
  736. parent = el.getparent()
  737. text = el.text or ''
  738. if el.tail:
  739. if not len(el):
  740. text += el.tail
  741. else:
  742. if el[-1].tail:
  743. el[-1].tail += el.tail
  744. else:
  745. el[-1].tail = el.tail
  746. index = parent.index(el)
  747. if text:
  748. if index == 0:
  749. previous = None
  750. else:
  751. previous = parent[index-1]
  752. if previous is None:
  753. if parent.text:
  754. parent.text += text
  755. else:
  756. parent.text = text
  757. else:
  758. if previous.tail:
  759. previous.tail += text
  760. else:
  761. previous.tail = text
  762. parent[index:index+1] = el.getchildren()
  763. class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
  764. """
  765. Acts like SequenceMatcher, but tries not to find very small equal
  766. blocks amidst large spans of changes
  767. """
  768. threshold = 2
  769. def get_matching_blocks(self):
  770. size = min(len(self.b), len(self.b))
  771. threshold = min(self.threshold, size / 4)
  772. actual = difflib.SequenceMatcher.get_matching_blocks(self)
  773. return [item for item in actual
  774. if item[2] > threshold
  775. or not item[2]]
  776. if __name__ == '__main__':
  777. from lxml.html import _diffcommand
  778. _diffcommand.main()