No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.
 
 
 
 

216 líneas
8.3 KiB

  # Functions for cleaning up trees and for removing elements from subtrees.
  def cleanup_namespaces(tree_or_element, top_nsmap=None, keep_ns_prefixes=None):
      """cleanup_namespaces(tree_or_element, top_nsmap=None, keep_ns_prefixes=None)

      Drop every namespace declaration in a subtree that none of the
      elements or attributes in that subtree actually uses.

      ``top_nsmap``, when given, must map prefixes to namespace URIs.
      Those namespaces are declared on the top element of the subtree
      before the cleanup runs, which makes it possible to move namespace
      declarations up to the top of the tree.

      ``keep_ns_prefixes``, when given, must be a list of prefixes.  The
      cleanup leaves these prefixes in place.
      """
      root = _rootNodeOrRaise(tree_or_element)
      c_root = root._c_node

      if top_nsmap:
          # Declare the requested namespaces on the top element first, then
          # apply them to the subtree.
          owner_doc = root._doc
          _setNodeNamespaces(c_root, owner_doc, None, top_nsmap)
          moveNodeToDocument(owner_doc, c_root.doc, c_root)

      # The prefixes are handed on as a set of _utf8()-converted values;
      # an empty or missing argument is passed on as None.
      if keep_ns_prefixes:
          keep_ns_prefixes = {_utf8(prefix) for prefix in keep_ns_prefixes}
      else:
          keep_ns_prefixes = None

      _removeUnusedNamespaceDeclarations(c_root, keep_ns_prefixes)
  def strip_attributes(tree_or_element, *attribute_names):
      """strip_attributes(tree_or_element, *attribute_names)

      Remove every attribute whose name is among ``attribute_names`` from
      an Element (or ElementTree) and from all of its descendants.

      Attribute names may use wildcards, as in `_Element.iter`.

      Example usage::

          strip_attributes(root_element,
                           'simpleattr',
                           '{http://some/ns}attrname',
                           '{http://other/ns}*')
      """
      cdef _MultiTagMatcher attr_matcher

      # Resolve the root first so that an invalid argument is still
      # reported even when no attribute names were passed.
      root = _rootNodeOrRaise(tree_or_element)

      if attribute_names:
          attr_matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, attribute_names)
          attr_matcher.cacheTags(root._doc)
          # Skip the tree walk entirely when the matcher rejects all attributes.
          if not attr_matcher.rejectsAllAttributes():
              _strip_attributes(root._c_node, attr_matcher)
  cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher):
      # Visit the nodes produced by the FOR_EACH_ELEMENT_FROM traversal that
      # starts at c_node and remove each attribute accepted by the matcher.
      cdef xmlAttr* c_current
      cdef xmlAttr* c_pending

      tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
      if c_node.type == tree.XML_ELEMENT_NODE:
          c_current = c_node.properties
          while c_current is not NULL:
              # Remember the successor before the current attribute is
              # possibly removed from the list.
              c_pending = c_current.next
              if matcher.matchesAttribute(c_current):
                  tree.xmlRemoveProp(c_current)
              c_current = c_pending
      tree.END_FOR_EACH_ELEMENT_FROM(c_node)
  def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
      """strip_elements(tree_or_element, *tag_names, with_tail=True)

      Remove every element with one of the given tag names from a tree or
      subtree.  A removed element goes away together with its whole
      subtree: attributes, text content and descendants.  Its tail text is
      removed as well, unless the ``with_tail`` keyword argument is
      explicitly set to False.

      Tag names may use wildcards, as in `_Element.iter`.

      Note that the element (or ElementTree root element) that you pass in
      is never deleted, even if it matches; only its descendants are
      processed.  To include the root element, check its tag name
      yourself before calling this function.

      Example usage::

          strip_elements(some_element,
              'simpletagname',             # non-namespaced tag
              '{http://some/ns}tagname',   # namespaced tag
              '{http://some/other/ns}*',   # any tag from a namespace
              lxml.etree.Comment           # comments
              )
      """
      cdef _MultiTagMatcher tag_matcher
      cdef xmlNode* c_root

      # Both lookups run before the empty-argument check so that an invalid
      # first argument is still reported.
      document = _documentOrRaise(tree_or_element)
      root = _rootNodeOrRaise(tree_or_element)
      if not tag_names:
          return

      tag_matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag_names)
      tag_matcher.cacheTags(document)
      if tag_matcher.rejectsAll():
          return

      c_root = root._c_node
      if isinstance(tree_or_element, _ElementTree):
          # For a whole tree, comments and PIs next to the root node are
          # included as well.
          if tag_matcher.matchesType(tree.XML_COMMENT_NODE):
              _removeSiblings(c_root, tree.XML_COMMENT_NODE, with_tail)
          if tag_matcher.matchesType(tree.XML_PI_NODE):
              _removeSiblings(c_root, tree.XML_PI_NODE, with_tail)

      _strip_elements(document, c_root, tag_matcher, with_tail)
  cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher,
                       bint with_tail):
      # Walk the traversal that starts at c_node and remove every child node
      # accepted by the matcher.
      cdef xmlNode* c_current
      cdef xmlNode* c_following

      tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
      if c_node.type == tree.XML_ELEMENT_NODE:
          # Work on the children of the visited node rather than on the node
          # itself: unlinking c_node would cause problems for the tree
          # iteration.
          c_current = _findChildForwards(c_node, 0)
          while c_current is not NULL:
              # Look up the successor before the current child may go away.
              c_following = _nextElement(c_current)
              if matcher.matches(c_current):
                  if c_current.type != tree.XML_ELEMENT_NODE:
                      # Matching non-element node.
                      if with_tail:
                          _removeText(c_current.next)
                      tree.xmlUnlinkNode(c_current)
                      attemptDeallocation(c_current)
                  else:
                      # NOTE(review): unlinking before _removeNode() when
                      # with_tail is false presumably keeps the tail text in
                      # the tree - confirm against _removeNode().
                      if not with_tail:
                          tree.xmlUnlinkNode(c_current)
                      _removeNode(doc, c_current)
              c_current = c_following
      tree.END_FOR_EACH_ELEMENT_FROM(c_node)
  def strip_tags(tree_or_element, *tag_names):
      """strip_tags(tree_or_element, *tag_names)

      Remove every element with one of the given tag names from a tree or
      subtree.  The elements and their attributes are removed, but *not*
      their text/tail content or their descendants: the text content and
      the children of a removed element are merged into its parent.

      Tag names may use wildcards, as in `_Element.iter`.

      Note that the element (or ElementTree root element) that you pass in
      is never deleted, even if it matches; only its descendants are
      processed.

      Example usage::

          strip_tags(some_element,
              'simpletagname',             # non-namespaced tag
              '{http://some/ns}tagname',   # namespaced tag
              '{http://some/other/ns}*',   # any tag from a namespace
              Comment                      # comments (including their text!)
              )
      """
      cdef _MultiTagMatcher tag_matcher
      cdef xmlNode* c_root

      # Both lookups run before the empty-argument check so that an invalid
      # first argument is still reported.
      document = _documentOrRaise(tree_or_element)
      root = _rootNodeOrRaise(tree_or_element)
      if not tag_names:
          return

      tag_matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag_names)
      tag_matcher.cacheTags(document)
      if tag_matcher.rejectsAll():
          return

      c_root = root._c_node
      if isinstance(tree_or_element, _ElementTree):
          # For a whole tree, comments and PIs next to the root node are
          # included as well; the with_tail flag is passed as 0 here.
          if tag_matcher.matchesType(tree.XML_COMMENT_NODE):
              _removeSiblings(c_root, tree.XML_COMMENT_NODE, 0)
          if tag_matcher.matchesType(tree.XML_PI_NODE):
              _removeSiblings(c_root, tree.XML_PI_NODE, 0)

      _strip_tags(document, c_root, tag_matcher)
  cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher):
      # Walk the traversal that starts at c_node; matching child elements are
      # replaced by their own children, other matching nodes are unlinked.
      cdef xmlNode* c_current
      cdef xmlNode* c_following

      tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
      if c_node.type == tree.XML_ELEMENT_NODE:
          # Work on the children of the visited node rather than on the node
          # itself: unlinking c_node would cause problems for the tree
          # iteration.
          c_current = _findChildForwards(c_node, 0)
          while c_current is not NULL:
              if not matcher.matches(c_current):
                  c_following = _nextElement(c_current)
              elif c_current.type == tree.XML_ELEMENT_NODE:
                  # Continue with the first child of the stripped element if
                  # it has one, otherwise with the next element after it.
                  c_following = (_findChildForwards(c_current, 0)
                                 or _nextElement(c_current))
                  _replaceNodeByChildren(doc, c_current)
                  # Short-circuit matters: nsDef is only read when the node
                  # was not deallocated.
                  if (not attemptDeallocation(c_current)
                          and c_current.nsDef is not NULL):
                      # make namespaces absolute
                      moveNodeToDocument(doc, doc._c_doc, c_current)
              else:
                  # Matching non-element node: take it out of the tree.
                  c_following = _nextElement(c_current)
                  tree.xmlUnlinkNode(c_current)
                  attemptDeallocation(c_current)
              c_current = c_following
      tree.END_FOR_EACH_ELEMENT_FROM(c_node)