# cython: language_level=3

try:
    import cython
except ImportError:
    class fake_cython:
        compiled = False
        def cfunc(self, func): return func
        def cclass(self, func): return func
        def declare(self, _, value): return value
        def __getattr__(self, type_name): return "object"

    cython = fake_cython()

try:
    from . import _difflib as difflib
    import inspect
    if inspect.isfunction(difflib.get_close_matches):
        raise ImportError(
            "Embedded difflib is not compiled to a fast binary, using the stdlib instead.")
    from cython.cimports.lxml.html._difflib import SequenceMatcher
except ImportError:
    import difflib
    if not cython.compiled:
        from difflib import SequenceMatcher

import itertools
import functools
import operator
import re

from lxml import etree
from lxml.html import fragment_fromstring
from . import defs

__all__ = ['html_annotate', 'htmldiff']

group_by_first_item = functools.partial(itertools.groupby, key=operator.itemgetter(0))


############################################################
## Annotation
############################################################

@cython.cfunc
def html_escape(text: str, _escapes: tuple = ('&amp;', '&lt;', '&gt;', '&quot;', '&#x27;')) -> str:
    # Not so slow compiled version of 'html.escape()'.
    # Most of the time, we replace little to nothing, so use a fast decision what needs to be done.
    ch: cython.Py_UCS4
    replace: cython.char[5] = [False] * 5
    for ch in text:
        replace[0] |= ch == '&'
        replace[1] |= ch == '<'
        replace[2] |= ch == '>'
        replace[3] |= ch == '"'
        replace[4] |= ch == "'"

    for i in range(5):
        if replace[i]:
            text = text.replace('&<>"\''[i], _escapes[i])

    return text

if not cython.compiled:
    from html import escape as html_escape


def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(version), text)
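
# A custom markup callable can be passed to html_annotate() instead of
# default_markup.  It receives the (already escaped) token text and the
# version annotation and returns HTML.  A minimal sketch (the function name
# and CSS class here are illustrative, not part of this module):
#
#   def markup_with_class(text, version):
#       return '<span class="annotated" title="%s">%s</span>' % (
#           html_escape(version), text)
#
#   html_annotate(doclist, markup=markup_with_class)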

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens


def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)


def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not tok.pre_tags and
                not result[-1].post_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result


@cython.cfunc
def compress_merge_back(tokens: list, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = last + last.trailing_whitespace + tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged
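
# Illustration (hypothetical tokens): given two adjacent word tokens that
# carry the same annotation, no post_tags on the first and no pre_tags on
# the second, compress_tokens() merges them into one token.  Roughly:
#
#   token('Hello', trailing_whitespace=' ') + token('World')
#       -> token('Hello World')
#
# keeping the pre_tags of the first token, the post_tags of the second,
# and the shared annotation.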

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        yield from token.pre_tags
        html = token.html()
        html = markup_func(html, token.annotation) + token.trailing_whitespace
        yield html
        yield from token.post_tags


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    try:
        result = ''.join(result).strip()
    except (ValueError, TypeError) as exc:
        print(exc)
        result = ''
    return fixup_ins_del_tags(result)
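
# Usage sketch (the exact output, in particular the spacing around the
# <ins>/<del> tags, is indicative and may differ slightly):
#
#   >>> htmldiff('<p>Here is some text.</p>',
#   ...          '<p>Here is some new text.</p>')
#   '<p>Here is some <ins>new</ins> text.</p>'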

def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)

    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    cleanup_delete(result)
    return result

def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        yield from token.pre_tags
        if not equal or not token.hide_when_equal:
            yield token.html() + token.trailing_whitespace
        yield from token.post_tags


def merge_insert(ins_chunks, doc: list):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced start/end tags
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.

    # Legacy note: We make a choice here.  Originally, we merged all sequences of
    # unbalanced tags together into separate start and end tag groups.  Now, we look at
    # each sequence separately, leading to more fine-grained diffs but different
    # tag structure than before.

    item: tuple
    for balanced, marked_chunks in group_by_first_item(mark_unbalanced(ins_chunks)):
        chunks = [item[1] for item in marked_chunks]
        if balanced == 'b':
            if doc and not doc[-1].endswith(' '):
                # Fix up the case where the word before the insert didn't end with a space.
                doc[-1] += ' '
            doc.append('<ins>')
            doc.extend(chunks)
            if doc[-1].endswith(' '):
                # We move space outside of </ins>.
                doc[-1] = doc[-1][:-1]
            doc.append('</ins> ')
        else:
            # unmatched start or end
            doc.extend(chunks)

@cython.cfunc
def tag_name_of_chunk(chunk: str) -> str:
    i: cython.Py_ssize_t
    ch: cython.Py_UCS4

    if chunk[0] != '<':
        return ""

    start_pos = 1
    for i, ch in enumerate(chunk):
        if ch == '/':
            start_pos = 2
        elif ch == '>':
            return chunk[start_pos:i]
        elif ch.isspace():
            return chunk[start_pos:i]

    return chunk[start_pos:]

if not cython.compiled:
    # Avoid performance regression in Python due to string iteration.
    def tag_name_of_chunk(chunk: str) -> str:
        return chunk.split(None, 1)[0].strip('<>/')
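
# Examples of what tag_name_of_chunk() returns:
#
#   '<div class="x">'  ->  'div'
#   '</p>'             ->  'p'
#
# Note that the compiled version returns '' for a chunk that is not a tag,
# while the stdlib fallback above returns the first word; callers are
# expected to rely on the result only for chunks that are known to be tags.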

# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:

class DEL_START:
    pass

class DEL_END:
    pass


def merge_delete(del_chunks, doc: list):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""
    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)

def cleanup_delete(chunks: list):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END)
    """
    chunk_count = len(chunks)

    i: cython.Py_ssize_t
    del_start: cython.Py_ssize_t
    del_end: cython.Py_ssize_t
    shift_start_right: cython.Py_ssize_t
    shift_end_left: cython.Py_ssize_t
    unbalanced_start: cython.Py_ssize_t
    unbalanced_end: cython.Py_ssize_t
    pos: cython.Py_ssize_t
    start_pos: cython.Py_ssize_t
    chunk: str

    start_pos = 0
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            del_start = chunks.index(DEL_START, start_pos)
        except ValueError:
            # Nothing found, we've cleaned up the entire doc
            break
        else:
            del_end = chunks.index(DEL_END, del_start + 1)

        shift_end_left = shift_start_right = 0
        unbalanced_start = unbalanced_end = 0

        deleted_chunks = mark_unbalanced(chunks[del_start+1:del_end])

        # For unbalanced start tags at the beginning, find matching (non-deleted)
        # end tags after the current DEL_END and move the start tag outside.
        for balanced, del_chunk in deleted_chunks:
            if balanced != 'us':
                break
            unbalanced_start += 1
            unbalanced_start_name = tag_name_of_chunk(del_chunk)
            for i in range(del_end+1, chunk_count):
                if chunks[i] is DEL_START:
                    break
                chunk = chunks[i]
                if chunk[0] != '<' or chunk[1] == '/':
                    # Reached a word or closing tag.
                    break
                name = tag_name_of_chunk(chunk)
                if name == 'ins':
                    # Cannot move into an insert.
                    break
                assert name != 'del', f"Unexpected delete tag: {chunk!r}"
                if name != unbalanced_start_name:
                    # Avoid mixing in other start tags.
                    break
                # Exclude start tag to balance the end tag.
                shift_start_right += 1

        # For unbalanced end tags at the end, find matching (non-deleted)
        # start tags before the current DEL_START and move the end tag outside.
        for balanced, del_chunk in reversed(deleted_chunks):
            if balanced != 'ue':
                break
            unbalanced_end += 1
            unbalanced_end_name = tag_name_of_chunk(del_chunk)
            for i in range(del_start - 1, -1, -1):
                if chunks[i] is DEL_END:
                    break
                chunk = chunks[i]
                if chunk[0] == '<' and chunk[1] != '/':
                    # Reached an opening tag, can we go further?  Maybe not...
                    break
                name = tag_name_of_chunk(chunk)
                if name == 'ins' or name == 'del':
                    # Cannot move into an insert or delete.
                    break
                if name != unbalanced_end_name:
                    # Avoid mixing in other end tags.
                    break
                # Exclude end tag to balance the start tag.
                shift_end_left += 1

        """
        # This is what we do below in loops, spelled out using slicing and list copying:

        new_del_start = del_start - shift_end_left
        chunks[del_start - shift_end_left : del_end + shift_start_right + 1] = [
            *chunks[del_start + 1: del_start + shift_start_right + 1],
            '<del>',
            *chunks[del_start + unbalanced_start + 1 : del_end - unbalanced_end],
            '</del> ',
            *chunks[del_end - shift_end_left: del_end],
        ]

        new_del_end = del_end - 2 * shift_end_left
        assert chunks[new_del_end] == '</del> '
        del_end = new_del_end

        if new_del_start > 0 and not chunks[new_del_start - 1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space.
            chunks[new_del_start - 1] += ' '

        if new_del_end > 0 and chunks[new_del_end - 1].endswith(' '):
            # Move space outside of </del>.
            chunks[new_del_end - 1] = chunks[new_del_end - 1][:-1]
        """

        pos = del_start - shift_end_left
        # Move re-balanced start tags before the '<del>'.
        for i in range(del_start + 1, del_start + shift_start_right + 1):
            chunks[pos] = chunks[i]
            pos += 1
        if pos and not chunks[pos - 1].endswith(' '):
            # Fix up the case where the word before '<del>' didn't have a trailing space.
            chunks[pos - 1] += ' '
        chunks[pos] = '<del>'
        pos += 1
        # Copy only the balanced deleted content between '<del>' and '</del>'.
        for i in range(del_start + unbalanced_start + 1, del_end - unbalanced_end):
            chunks[pos] = chunks[i]
            pos += 1
        if chunks[pos - 1].endswith(' '):
            # Move trailing space outside of </del>.
            chunks[pos - 1] = chunks[pos - 1][:-1]
        chunks[pos] = '</del> '
        pos += 1
        # Move re-balanced end tags after the '</del>'.
        for i in range(del_end - shift_end_left, del_end):
            chunks[pos] = chunks[i]
            pos += 1
        # Adjust the length of the processed part in 'chunks'.
        del chunks[pos : del_end + shift_start_right + 1]
        start_pos = pos

@cython.cfunc
def mark_unbalanced(chunks) -> list:
    tag_stack = []
    marked = []

    chunk: str
    parents: list

    for chunk in chunks:
        if not chunk.startswith('<'):
            marked.append(('b', chunk))
            continue

        name = tag_name_of_chunk(chunk)
        if name in empty_tags:
            marked.append(('b', chunk))
            continue

        if chunk[1] == '/':
            # closing tag found, unwind tag stack
            while tag_stack:
                start_name, start_chunk, parents = tag_stack.pop()
                if start_name == name:
                    # balanced tag closing, keep rest of stack intact
                    parents.append(('b', start_chunk))
                    parents.extend(marked)
                    parents.append(('b', chunk))
                    marked = parents
                    chunk = None
                    break
                else:
                    # unmatched start tag
                    parents.append(('us', start_chunk))
                    parents.extend(marked)
                    marked = parents

            if chunk is not None:
                # unmatched end tag left after clearing the stack
                marked.append(('ue', chunk))
        else:
            # new start tag found
            tag_stack.append((name, chunk, marked))
            marked = []

    # add any unbalanced start tags
    while tag_stack:
        _, start_chunk, parents = tag_stack.pop()
        parents.append(('us', start_chunk))
        parents.extend(marked)
        marked = parents

    return marked
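
# Marker legend: mark_unbalanced() pairs each chunk with 'b' (balanced),
# 'us' (unmatched start tag) or 'ue' (unmatched end tag).  For example
# (hypothetical chunks):
#
#   mark_unbalanced(['<b>', 'word ', '</b>', '</div>'])
#   -> [('b', '<b>'), ('b', 'word '), ('b', '</b>'), ('ue', '</div>')]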

class token(str):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags that are adjacent to the word are attached to
    it as pre_tags, and closing tags that follow the word as post_tags.
    Some exceptions occur when there are empty tags adjacent to a word,
    so there may be close tags in pre_tags, or open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = str.__new__(cls, text)
        obj.pre_tags = pre_tags if pre_tags is not None else []
        obj.post_tags = post_tags if post_tags is not None else []
        obj.trailing_whitespace = trailing_whitespace
        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (
            str.__repr__(self), self.pre_tags, self.post_tags, self.trailing_whitespace)

    def html(self):
        return str(self)

class tag_token(token):
    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, f"{tag}: {data}",
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)

    def html(self):
        return self.html_repr

class href_token(token):
    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self


def tokenize(html, include_hrefs=True):
    """
    Parses the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)

def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)


_search_body = re.compile(r'<body.*?>', re.I|re.S).search
_search_end_body = re.compile(r'</body.*?>', re.I|re.S).search
_replace_ins_del = re.compile(r'</?(ins|del).*?>', re.I|re.S).sub

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body>).
    Also <ins> and <del> tags are removed.  """
    match = _search_body(html)
    if match:
        html = html[match.end():]
    match = _search_end_body(html)
    if match:
        html = html[:match.start()]
    html = _replace_ins_del('', html)
    return html
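
# For example (indicative):
#
#   cleanup_html('<html><body><p>Hi <ins>there</ins></p></body></html>')
#   -> '<p>Hi there</p>'
#
# Everything up to and including <body> and from </body> on is dropped,
# and any <ins>/<del> tags are stripped out.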

def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]

def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False, f"Unexpected chunk: {chunk!r}"

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)
        return result

# All the tags in HTML that don't require end tags:
empty_tags = cython.declare(frozenset, defs.empty_tags)

block_level_tags = cython.declare(frozenset, frozenset([
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
]))

block_level_container_tags = cython.declare(frozenset, frozenset([
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
]))

any_block_level_tag = cython.declare(tuple, tuple(sorted(
    block_level_tags | block_level_container_tags))
)

def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        yield from flatten_el(child, include_hrefs=include_hrefs)
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)


_find_words = re.compile(r'\S+(?:\s+|$)', re.U).findall

def split_words(text):
    """ Splits some text into words.  Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = _find_words(text)
    return words
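
# For example:
#
#   split_words('Hello  World\n')  ->  ['Hello  ', 'World\n']
#
# Each word keeps the whitespace that follows it, so that token text can be
# re-joined without losing the original spacing.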

_has_start_whitespace = re.compile(r'^[ \t\n\r]').match

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    attributes = ''.join([
        f' {name}="{html_escape(value)}"'
        for name, value in el.attrib.items()
    ])
    return f'<{el.tag}{attributes}>'

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    tail = el.tail
    extra = ' ' if tail and _has_start_whitespace(tail) else ''
    return f'</{el.tag}>{extra}'

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, str), (
        f"You should pass in an element, not a string like {el!r}")
    html = etree.tostring(el, method="html", encoding='unicode')
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html

@cython.cfunc
def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for el in list(doc.iter('ins', 'del')):
        if not _contains_block_level_tag(el):
            continue
        _move_el_inside_block(el, tag=el.tag)
        el.drop_tag()
        #_merge_element_contents(el)


@cython.cfunc
def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    for el in el.iter(*any_block_level_tag):
        return True
    return False


@cython.cfunc
def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    makeelement = el.makeelement
    for block_level_el in el.iter(*any_block_level_tag):
        if block_level_el is not el:
            break
    else:
        # No block-level tags in any child
        children_tag = makeelement(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(iter(el))
        el[:] = [children_tag]
        return

    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = makeelement(tag)
                tail_tag.text = child.tail
                child.tail = None
                child.addnext(tail_tag)
        else:
            child_tag = makeelement(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = makeelement(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)

def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text
    tail = el.tail
    if tail:
        if not len(el):
            text = (text or '') + tail
        else:
            el[-1].tail = (el[-1].tail or '') + tail
    index = parent.index(el)
    if text:
        previous = el.getprevious()
        if previous is None:
            parent.text = (parent.text or '') + text
        else:
            previous.tail = (previous.tail or '') + text
    parent[index:index+1] = el.getchildren()

@cython.final
@cython.cclass
class InsensitiveSequenceMatcher(SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    @cython.cfunc
    def get_matching_blocks(self) -> list:
        size: cython.Py_ssize_t = min(len(self.a), len(self.b))
        threshold: cython.Py_ssize_t = self.threshold
        threshold = min(threshold, size // 4)
        actual = SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
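
# Why the threshold matters: without it, a long replaced passage that happens
# to share a tiny common word (e.g. a lone 'the') with the old text would be
# split into several small ins/del pairs around that word.  Dropping matching
# blocks of size <= threshold (but keeping the zero-length sentinel block that
# difflib appends at the end) keeps such changes together as one block.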

if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()