Nie możesz wybrać więcej niż 25 tematów. Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.
 
 
 
 

1802 wiersze
62 KiB

  1. # Private/public helper functions for API functions
  2. from lxml.includes cimport uri
  3. cdef void displayNode(xmlNode* c_node, indent) noexcept:
  4. # to help with debugging
  5. cdef xmlNode* c_child
  6. try:
  7. print(indent * ' ', <long>c_node)
  8. c_child = c_node.children
  9. while c_child is not NULL:
  10. displayNode(c_child, indent + 1)
  11. c_child = c_child.next
  12. finally:
  13. return # swallow any exceptions
  14. cdef inline bint _isHtmlDocument(_Element element) except -1:
  15. cdef xmlNode* c_node = element._c_node
  16. return (
  17. c_node is not NULL and c_node.doc is not NULL and
  18. c_node.doc.properties & tree.XML_DOC_HTML != 0
  19. )
  20. cdef inline int _assertValidNode(_Element element) except -1:
  21. assert element._c_node is not NULL, "invalid Element proxy at %s" % id(element)
  22. cdef inline int _assertValidDoc(_Document doc) except -1:
  23. assert doc._c_doc is not NULL, "invalid Document proxy at %s" % id(doc)
  24. cdef _Document _documentOrRaise(object input):
  25. """Call this to get the document of a _Document, _ElementTree or _Element
  26. object, or to raise an exception if it can't be determined.
  27. Should be used in all API functions for consistency.
  28. """
  29. cdef _Document doc
  30. if isinstance(input, _ElementTree):
  31. if (<_ElementTree>input)._context_node is not None:
  32. doc = (<_ElementTree>input)._context_node._doc
  33. else:
  34. doc = None
  35. elif isinstance(input, _Element):
  36. doc = (<_Element>input)._doc
  37. elif isinstance(input, _Document):
  38. doc = <_Document>input
  39. else:
  40. raise TypeError, f"Invalid input object: {python._fqtypename(input).decode('utf8')}"
  41. if doc is None:
  42. raise ValueError, f"Input object has no document: {python._fqtypename(input).decode('utf8')}"
  43. _assertValidDoc(doc)
  44. return doc
  45. cdef _Element _rootNodeOrRaise(object input):
  46. """Call this to get the root node of a _Document, _ElementTree or
  47. _Element object, or to raise an exception if it can't be determined.
  48. Should be used in all API functions for consistency.
  49. """
  50. cdef _Element node
  51. if isinstance(input, _ElementTree):
  52. node = (<_ElementTree>input)._context_node
  53. elif isinstance(input, _Element):
  54. node = <_Element>input
  55. elif isinstance(input, _Document):
  56. node = (<_Document>input).getroot()
  57. else:
  58. raise TypeError, f"Invalid input object: {python._fqtypename(input).decode('utf8')}"
  59. if (node is None or not node._c_node or
  60. node._c_node.type != tree.XML_ELEMENT_NODE):
  61. raise ValueError, f"Input object is not an XML element: {python._fqtypename(input).decode('utf8')}"
  62. _assertValidNode(node)
  63. return node
  64. cdef bint _isAncestorOrSame(xmlNode* c_ancestor, xmlNode* c_node) noexcept:
  65. while c_node:
  66. if c_node is c_ancestor:
  67. return True
  68. c_node = c_node.parent
  69. return False
  70. cdef _Element _makeElement(tag, xmlDoc* c_doc, _Document doc,
  71. _BaseParser parser, text, tail, attrib, nsmap,
  72. dict extra_attrs):
  73. """Create a new element and initialize text content, namespaces and
  74. attributes.
  75. This helper function will reuse as much of the existing document as
  76. possible:
  77. If 'parser' is None, the parser will be inherited from 'doc' or the
  78. default parser will be used.
  79. If 'doc' is None, 'c_doc' is used to create a new _Document and the new
  80. element is made its root node.
  81. If 'c_doc' is also NULL, a new xmlDoc will be created.
  82. """
  83. cdef xmlNode* c_node
  84. if doc is not None:
  85. c_doc = doc._c_doc
  86. ns_utf, name_utf = _getNsTag(tag)
  87. if parser is not None and parser._for_html:
  88. _htmlTagValidOrRaise(name_utf)
  89. if c_doc is NULL:
  90. c_doc = _newHTMLDoc()
  91. else:
  92. _tagValidOrRaise(name_utf)
  93. if c_doc is NULL:
  94. c_doc = _newXMLDoc()
  95. c_node = _createElement(c_doc, name_utf)
  96. if c_node is NULL:
  97. if doc is None and c_doc is not NULL:
  98. tree.xmlFreeDoc(c_doc)
  99. raise MemoryError()
  100. try:
  101. if doc is None:
  102. tree.xmlDocSetRootElement(c_doc, c_node)
  103. doc = _documentFactory(c_doc, parser)
  104. if text is not None:
  105. _setNodeText(c_node, text)
  106. if tail is not None:
  107. _setTailText(c_node, tail)
  108. # add namespaces to node if necessary
  109. _setNodeNamespaces(c_node, doc, ns_utf, nsmap)
  110. _initNodeAttributes(c_node, doc, attrib, extra_attrs)
  111. return _elementFactory(doc, c_node)
  112. except:
  113. # free allocated c_node/c_doc unless Python does it for us
  114. if c_node.doc is not c_doc:
  115. # node not yet in document => will not be freed by document
  116. if tail is not None:
  117. _removeText(c_node.next) # tail
  118. tree.xmlFreeNode(c_node)
  119. if doc is None:
  120. # c_doc will not be freed by doc
  121. tree.xmlFreeDoc(c_doc)
  122. raise
  123. cdef int _initNewElement(_Element element, bint is_html, name_utf, ns_utf,
  124. _BaseParser parser, attrib, nsmap, dict extra_attrs) except -1:
  125. """Initialise a new Element object.
  126. This is used when users instantiate a Python Element subclass
  127. directly, without it being mapped to an existing XML node.
  128. """
  129. cdef xmlDoc* c_doc
  130. cdef xmlNode* c_node
  131. cdef _Document doc
  132. if is_html:
  133. _htmlTagValidOrRaise(name_utf)
  134. c_doc = _newHTMLDoc()
  135. else:
  136. _tagValidOrRaise(name_utf)
  137. c_doc = _newXMLDoc()
  138. c_node = _createElement(c_doc, name_utf)
  139. if c_node is NULL:
  140. if c_doc is not NULL:
  141. tree.xmlFreeDoc(c_doc)
  142. raise MemoryError()
  143. tree.xmlDocSetRootElement(c_doc, c_node)
  144. doc = _documentFactory(c_doc, parser)
  145. # add namespaces to node if necessary
  146. _setNodeNamespaces(c_node, doc, ns_utf, nsmap)
  147. _initNodeAttributes(c_node, doc, attrib, extra_attrs)
  148. _registerProxy(element, doc, c_node)
  149. element._init()
  150. return 0
  151. cdef _Element _makeSubElement(_Element parent, tag, text, tail,
  152. attrib, nsmap, dict extra_attrs):
  153. """Create a new child element and initialize text content, namespaces and
  154. attributes.
  155. """
  156. cdef xmlNode* c_node
  157. cdef xmlDoc* c_doc
  158. if parent is None or parent._doc is None:
  159. return None
  160. _assertValidNode(parent)
  161. ns_utf, name_utf = _getNsTag(tag)
  162. c_doc = parent._doc._c_doc
  163. if parent._doc._parser is not None and parent._doc._parser._for_html:
  164. _htmlTagValidOrRaise(name_utf)
  165. else:
  166. _tagValidOrRaise(name_utf)
  167. c_node = _createElement(c_doc, name_utf)
  168. if c_node is NULL:
  169. raise MemoryError()
  170. tree.xmlAddChild(parent._c_node, c_node)
  171. try:
  172. if text is not None:
  173. _setNodeText(c_node, text)
  174. if tail is not None:
  175. _setTailText(c_node, tail)
  176. # add namespaces to node if necessary
  177. _setNodeNamespaces(c_node, parent._doc, ns_utf, nsmap)
  178. _initNodeAttributes(c_node, parent._doc, attrib, extra_attrs)
  179. return _elementFactory(parent._doc, c_node)
  180. except:
  181. # make sure we clean up in case of an error
  182. _removeNode(parent._doc, c_node)
  183. raise
  184. cdef int _setNodeNamespaces(xmlNode* c_node, _Document doc,
  185. object node_ns_utf, object nsmap) except -1:
  186. """Lookup current namespace prefixes, then set namespace structure for
  187. node (if 'node_ns_utf' was provided) and register new ns-prefix mappings.
  188. 'node_ns_utf' should only be passed for a newly created node.
  189. """
  190. cdef xmlNs* c_ns
  191. cdef list nsdefs
  192. if nsmap:
  193. for prefix, href in _iter_nsmap(nsmap):
  194. href_utf = _utf8(href)
  195. _uriValidOrRaise(href_utf)
  196. c_href = _xcstr(href_utf)
  197. if prefix is not None:
  198. prefix_utf = _utf8(prefix)
  199. _prefixValidOrRaise(prefix_utf)
  200. c_prefix = _xcstr(prefix_utf)
  201. else:
  202. c_prefix = <const_xmlChar*>NULL
  203. # add namespace with prefix if it is not already known
  204. c_ns = tree.xmlSearchNs(doc._c_doc, c_node, c_prefix)
  205. if c_ns is NULL or \
  206. c_ns.href is NULL or \
  207. tree.xmlStrcmp(c_ns.href, c_href) != 0:
  208. c_ns = tree.xmlNewNs(c_node, c_href, c_prefix)
  209. if href_utf == node_ns_utf:
  210. tree.xmlSetNs(c_node, c_ns)
  211. node_ns_utf = None
  212. if node_ns_utf is not None:
  213. _uriValidOrRaise(node_ns_utf)
  214. doc._setNodeNs(c_node, _xcstr(node_ns_utf))
  215. return 0
  216. cdef dict _build_nsmap(xmlNode* c_node):
  217. """
  218. Namespace prefix->URI mapping known in the context of this Element.
  219. This includes all namespace declarations of the parents.
  220. """
  221. cdef xmlNs* c_ns
  222. nsmap = {}
  223. while c_node is not NULL and c_node.type == tree.XML_ELEMENT_NODE:
  224. c_ns = c_node.nsDef
  225. while c_ns is not NULL:
  226. if c_ns.prefix or c_ns.href:
  227. prefix = funicodeOrNone(c_ns.prefix)
  228. if prefix not in nsmap:
  229. nsmap[prefix] = funicodeOrNone(c_ns.href)
  230. c_ns = c_ns.next
  231. c_node = c_node.parent
  232. return nsmap
  233. cdef _iter_nsmap(nsmap):
  234. """
  235. Create a reproducibly ordered iterable from an nsmap mapping.
  236. Tries to preserve an existing order and sorts if it assumes no order.
  237. The difference to _iter_attrib() is that None doesn't sort with strings
  238. in Py3.x.
  239. """
  240. if isinstance(nsmap, dict):
  241. # dicts are insertion-ordered in Py3.6+ => keep the user provided order.
  242. return nsmap.items()
  243. if len(nsmap) <= 1:
  244. return nsmap.items()
  245. # nsmap will usually be a plain unordered dict => avoid type checking overhead
  246. if type(nsmap) is not dict and isinstance(nsmap, OrderedDict):
  247. return nsmap.items() # keep existing order
  248. if None not in nsmap:
  249. return sorted(nsmap.items())
  250. # Move the default namespace to the end. This makes sure libxml2
  251. # prefers a prefix if the ns is defined redundantly on the same
  252. # element. That way, users can work around a problem themselves
  253. # where default namespace attributes on non-default namespaced
  254. # elements serialise without prefix (i.e. into the non-default
  255. # namespace).
  256. default_ns = nsmap[None]
  257. nsdefs = [(k, v) for k, v in nsmap.items() if k is not None]
  258. nsdefs.sort()
  259. nsdefs.append((None, default_ns))
  260. return nsdefs
  261. cdef _iter_attrib(attrib):
  262. """
  263. Create a reproducibly ordered iterable from an attrib mapping.
  264. Tries to preserve an existing order and sorts if it assumes no order.
  265. """
  266. # dicts are insertion-ordered in Py3.6+ => keep the user provided order.
  267. if isinstance(attrib, (dict, _Attrib, OrderedDict)):
  268. return attrib.items()
  269. # assume it's an unordered mapping of some kind
  270. return sorted(attrib.items())
  271. cdef _initNodeAttributes(xmlNode* c_node, _Document doc, attrib, dict extra):
  272. """Initialise the attributes of an element node.
  273. """
  274. cdef bint is_html
  275. cdef xmlNs* c_ns
  276. if attrib is not None and not hasattr(attrib, 'items'):
  277. raise TypeError, f"Invalid attribute dictionary: {python._fqtypename(attrib).decode('utf8')}"
  278. if not attrib and not extra:
  279. return # nothing to do
  280. is_html = doc._parser._for_html
  281. seen = set()
  282. if extra:
  283. for name, value in extra.items():
  284. _addAttributeToNode(c_node, doc, is_html, name, value, seen)
  285. if attrib:
  286. for name, value in _iter_attrib(attrib):
  287. _addAttributeToNode(c_node, doc, is_html, name, value, seen)
  288. cdef int _addAttributeToNode(xmlNode* c_node, _Document doc, bint is_html,
  289. name, value, set seen_tags) except -1:
  290. ns_utf, name_utf = tag = _getNsTag(name)
  291. if tag in seen_tags:
  292. return 0
  293. seen_tags.add(tag)
  294. if not is_html:
  295. _attributeValidOrRaise(name_utf)
  296. value_utf = _utf8(value)
  297. if ns_utf is None:
  298. tree.xmlNewProp(c_node, _xcstr(name_utf), _xcstr(value_utf))
  299. else:
  300. _uriValidOrRaise(ns_utf)
  301. c_ns = doc._findOrBuildNodeNs(c_node, _xcstr(ns_utf), NULL, 1)
  302. tree.xmlNewNsProp(c_node, c_ns,
  303. _xcstr(name_utf), _xcstr(value_utf))
  304. return 0
  305. ctypedef struct _ns_node_ref:
  306. xmlNs* ns
  307. xmlNode* node
  308. cdef int _collectNsDefs(xmlNode* c_element, _ns_node_ref **_c_ns_list,
  309. size_t *_c_ns_list_len, size_t *_c_ns_list_size) except -1:
  310. c_ns_list = _c_ns_list[0]
  311. cdef size_t c_ns_list_len = _c_ns_list_len[0]
  312. cdef size_t c_ns_list_size = _c_ns_list_size[0]
  313. c_nsdef = c_element.nsDef
  314. while c_nsdef is not NULL:
  315. if c_ns_list_len >= c_ns_list_size:
  316. if c_ns_list is NULL:
  317. c_ns_list_size = 20
  318. else:
  319. c_ns_list_size *= 2
  320. c_nsref_ptr = <_ns_node_ref*> python.lxml_realloc(
  321. c_ns_list, c_ns_list_size, sizeof(_ns_node_ref))
  322. if c_nsref_ptr is NULL:
  323. if c_ns_list is not NULL:
  324. python.lxml_free(c_ns_list)
  325. _c_ns_list[0] = NULL
  326. raise MemoryError()
  327. c_ns_list = c_nsref_ptr
  328. c_ns_list[c_ns_list_len] = _ns_node_ref(c_nsdef, c_element)
  329. c_ns_list_len += 1
  330. c_nsdef = c_nsdef.next
  331. _c_ns_list_size[0] = c_ns_list_size
  332. _c_ns_list_len[0] = c_ns_list_len
  333. _c_ns_list[0] = c_ns_list
  334. cdef int _removeUnusedNamespaceDeclarations(xmlNode* c_element, set prefixes_to_keep) except -1:
  335. """Remove any namespace declarations from a subtree that are not used by
  336. any of its elements (or attributes).
  337. If a 'prefixes_to_keep' is provided, it must be a set of prefixes.
  338. Any corresponding namespace mappings will not be removed as part of the cleanup.
  339. """
  340. cdef xmlNode* c_node
  341. cdef _ns_node_ref* c_ns_list = NULL
  342. cdef size_t c_ns_list_size = 0
  343. cdef size_t c_ns_list_len = 0
  344. cdef size_t i
  345. if c_element.parent and c_element.parent.type == tree.XML_DOCUMENT_NODE:
  346. # include declarations on the document node
  347. _collectNsDefs(c_element.parent, &c_ns_list, &c_ns_list_len, &c_ns_list_size)
  348. tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_element, c_element, 1)
  349. # collect all new namespace declarations into the ns list
  350. if c_element.nsDef:
  351. _collectNsDefs(c_element, &c_ns_list, &c_ns_list_len, &c_ns_list_size)
  352. # remove all namespace declarations from the list that are referenced
  353. if c_ns_list_len and c_element.type == tree.XML_ELEMENT_NODE:
  354. c_node = c_element
  355. while c_node and c_ns_list_len:
  356. if c_node.ns:
  357. for i in range(c_ns_list_len):
  358. if c_node.ns is c_ns_list[i].ns:
  359. c_ns_list_len -= 1
  360. c_ns_list[i] = c_ns_list[c_ns_list_len]
  361. #c_ns_list[c_ns_list_len] = _ns_node_ref(NULL, NULL)
  362. break
  363. if c_node is c_element:
  364. # continue with attributes
  365. c_node = <xmlNode*>c_element.properties
  366. else:
  367. c_node = c_node.next
  368. tree.END_FOR_EACH_ELEMENT_FROM(c_element)
  369. if c_ns_list is NULL:
  370. return 0
  371. # free all namespace declarations that remained in the list,
  372. # except for those we should keep explicitly
  373. cdef xmlNs* c_nsdef
  374. for i in range(c_ns_list_len):
  375. if prefixes_to_keep is not None:
  376. if c_ns_list[i].ns.prefix and c_ns_list[i].ns.prefix in prefixes_to_keep:
  377. continue
  378. c_node = c_ns_list[i].node
  379. c_nsdef = c_node.nsDef
  380. if c_nsdef is c_ns_list[i].ns:
  381. c_node.nsDef = c_node.nsDef.next
  382. else:
  383. while c_nsdef.next is not c_ns_list[i].ns:
  384. c_nsdef = c_nsdef.next
  385. c_nsdef.next = c_nsdef.next.next
  386. tree.xmlFreeNs(c_ns_list[i].ns)
  387. if c_ns_list is not NULL:
  388. python.lxml_free(c_ns_list)
  389. return 0
  390. cdef xmlNs* _searchNsByHref(xmlNode* c_node, const_xmlChar* c_href, bint is_attribute) noexcept:
  391. """Search a namespace declaration that covers a node (element or
  392. attribute).
  393. For attributes, try to find a prefixed namespace declaration
  394. instead of the default namespaces. This helps in supporting
  395. round-trips for attributes on elements with a different namespace.
  396. """
  397. cdef xmlNs* c_ns
  398. cdef xmlNs* c_default_ns = NULL
  399. cdef xmlNode* c_element
  400. if c_href is NULL or c_node is NULL or c_node.type == tree.XML_ENTITY_REF_NODE:
  401. return NULL
  402. if tree.xmlStrcmp(c_href, tree.XML_XML_NAMESPACE) == 0:
  403. # no special cases here, let libxml2 handle this
  404. return tree.xmlSearchNsByHref(c_node.doc, c_node, c_href)
  405. if c_node.type == tree.XML_ATTRIBUTE_NODE:
  406. is_attribute = 1
  407. while c_node is not NULL and c_node.type != tree.XML_ELEMENT_NODE:
  408. c_node = c_node.parent
  409. c_element = c_node
  410. while c_node is not NULL:
  411. if c_node.type == tree.XML_ELEMENT_NODE:
  412. c_ns = c_node.nsDef
  413. while c_ns is not NULL:
  414. if c_ns.href is not NULL and tree.xmlStrcmp(c_href, c_ns.href) == 0:
  415. if c_ns.prefix is NULL and is_attribute:
  416. # for attributes, continue searching a named
  417. # prefix, but keep the first default namespace
  418. # declaration that we found
  419. if c_default_ns is NULL:
  420. c_default_ns = c_ns
  421. elif tree.xmlSearchNs(
  422. c_element.doc, c_element, c_ns.prefix) is c_ns:
  423. # start node is in namespace scope => found!
  424. return c_ns
  425. c_ns = c_ns.next
  426. if c_node is not c_element and c_node.ns is not NULL:
  427. # optimise: the node may have the namespace itself
  428. c_ns = c_node.ns
  429. if c_ns.href is not NULL and tree.xmlStrcmp(c_href, c_ns.href) == 0:
  430. if c_ns.prefix is NULL and is_attribute:
  431. # for attributes, continue searching a named
  432. # prefix, but keep the first default namespace
  433. # declaration that we found
  434. if c_default_ns is NULL:
  435. c_default_ns = c_ns
  436. elif tree.xmlSearchNs(
  437. c_element.doc, c_element, c_ns.prefix) is c_ns:
  438. # start node is in namespace scope => found!
  439. return c_ns
  440. c_node = c_node.parent
  441. # nothing found => use a matching default namespace or fail
  442. if c_default_ns is not NULL:
  443. if tree.xmlSearchNs(c_element.doc, c_element, NULL) is c_default_ns:
  444. return c_default_ns
  445. return NULL
  446. cdef int _replaceNodeByChildren(_Document doc, xmlNode* c_node) except -1:
  447. # NOTE: this does not deallocate the node, just unlink it!
  448. cdef xmlNode* c_parent
  449. cdef xmlNode* c_child
  450. if c_node.children is NULL:
  451. tree.xmlUnlinkNode(c_node)
  452. return 0
  453. c_parent = c_node.parent
  454. # fix parent links of children
  455. c_child = c_node.children
  456. while c_child is not NULL:
  457. c_child.parent = c_parent
  458. c_child = c_child.next
  459. # fix namespace references of children if their parent's namespace
  460. # declarations get lost
  461. if c_node.nsDef is not NULL:
  462. c_child = c_node.children
  463. while c_child is not NULL:
  464. moveNodeToDocument(doc, doc._c_doc, c_child)
  465. c_child = c_child.next
  466. # fix sibling links to/from child slice
  467. if c_node.prev is NULL:
  468. c_parent.children = c_node.children
  469. else:
  470. c_node.prev.next = c_node.children
  471. c_node.children.prev = c_node.prev
  472. if c_node.next is NULL:
  473. c_parent.last = c_node.last
  474. else:
  475. c_node.next.prev = c_node.last
  476. c_node.last.next = c_node.next
  477. # unlink c_node
  478. c_node.children = c_node.last = NULL
  479. c_node.parent = c_node.next = c_node.prev = NULL
  480. return 0
  481. cdef unicode _attributeValue(xmlNode* c_element, xmlAttr* c_attrib_node):
  482. c_href = _getNs(<xmlNode*>c_attrib_node)
  483. value = tree.xmlGetNsProp(c_element, c_attrib_node.name, c_href)
  484. try:
  485. result = funicode(value)
  486. finally:
  487. tree.xmlFree(value)
  488. return result
  489. cdef unicode _attributeValueFromNsName(xmlNode* c_element,
  490. const_xmlChar* c_href, const_xmlChar* c_name):
  491. c_result = tree.xmlGetNsProp(c_element, c_name, c_href)
  492. if c_result is NULL:
  493. return None
  494. try:
  495. result = funicode(c_result)
  496. finally:
  497. tree.xmlFree(c_result)
  498. return result
  499. cdef object _getNodeAttributeValue(xmlNode* c_node, key, default):
  500. ns, tag = _getNsTag(key)
  501. c_href = <const_xmlChar*>NULL if ns is None else _xcstr(ns)
  502. c_result = tree.xmlGetNsProp(c_node, _xcstr(tag), c_href)
  503. if c_result is NULL:
  504. # XXX free namespace that is not in use..?
  505. return default
  506. try:
  507. result = funicode(c_result)
  508. finally:
  509. tree.xmlFree(c_result)
  510. return result
  511. cdef inline object _getAttributeValue(_Element element, key, default):
  512. return _getNodeAttributeValue(element._c_node, key, default)
  513. cdef int _setAttributeValue(_Element element, key, value) except -1:
  514. cdef const_xmlChar* c_value
  515. cdef xmlNs* c_ns
  516. ns, tag = _getNsTag(key)
  517. is_html = element._doc._parser._for_html
  518. if not is_html:
  519. _attributeValidOrRaise(tag)
  520. c_tag = _xcstr(tag)
  521. if value is None and is_html:
  522. c_value = NULL
  523. else:
  524. if isinstance(value, QName):
  525. value = _resolveQNameText(element, value)
  526. else:
  527. value = _utf8(value)
  528. c_value = _xcstr(value)
  529. if ns is None:
  530. c_ns = NULL
  531. else:
  532. c_ns = element._doc._findOrBuildNodeNs(element._c_node, _xcstr(ns), NULL, 1)
  533. tree.xmlSetNsProp(element._c_node, c_ns, c_tag, c_value)
  534. return 0
  535. cdef int _delAttribute(_Element element, key) except -1:
  536. ns, tag = _getNsTag(key)
  537. c_href = <const_xmlChar*>NULL if ns is None else _xcstr(ns)
  538. if _delAttributeFromNsName(element._c_node, c_href, _xcstr(tag)):
  539. raise KeyError, key
  540. return 0
  541. cdef int _delAttributeFromNsName(xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name) noexcept:
  542. c_attr = tree.xmlHasNsProp(c_node, c_name, c_href)
  543. if c_attr is NULL:
  544. # XXX free namespace that is not in use..?
  545. return -1
  546. tree.xmlRemoveProp(c_attr)
  547. return 0
  548. cdef list _collectAttributes(xmlNode* c_node, int collecttype):
  549. """Collect all attributes of a node in a list. Depending on collecttype,
  550. it collects either the name (1), the value (2) or the name-value tuples.
  551. """
  552. cdef Py_ssize_t count
  553. c_attr = c_node.properties
  554. count = 0
  555. while c_attr is not NULL:
  556. if c_attr.type == tree.XML_ATTRIBUTE_NODE:
  557. count += 1
  558. c_attr = c_attr.next
  559. if not count:
  560. return []
  561. attributes = [None] * count
  562. c_attr = c_node.properties
  563. count = 0
  564. while c_attr is not NULL:
  565. if c_attr.type == tree.XML_ATTRIBUTE_NODE:
  566. if collecttype == 1:
  567. item = _namespacedName(<xmlNode*>c_attr)
  568. elif collecttype == 2:
  569. item = _attributeValue(c_node, c_attr)
  570. else:
  571. item = (_namespacedName(<xmlNode*>c_attr),
  572. _attributeValue(c_node, c_attr))
  573. attributes[count] = item
  574. count += 1
  575. c_attr = c_attr.next
  576. return attributes
  577. cdef object __RE_XML_ENCODING = re.compile(
  578. r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
  579. cdef object __REPLACE_XML_ENCODING = __RE_XML_ENCODING.sub
  580. cdef object __HAS_XML_ENCODING = __RE_XML_ENCODING.match
  581. cdef object _stripEncodingDeclaration(object xml_string):
  582. # this is a hack to remove the XML encoding declaration from unicode
  583. return __REPLACE_XML_ENCODING(r'\g<1>\g<2>', xml_string)
  584. cdef bint _hasEncodingDeclaration(object xml_string) except -1:
  585. # check if a (unicode) string has an XML encoding declaration
  586. return __HAS_XML_ENCODING(xml_string) is not None
  587. cdef inline bint _hasText(xmlNode* c_node) noexcept:
  588. return c_node is not NULL and _textNodeOrSkip(c_node.children) is not NULL
  589. cdef inline bint _hasTail(xmlNode* c_node) noexcept:
  590. return c_node is not NULL and _textNodeOrSkip(c_node.next) is not NULL
  591. cdef inline bint _hasNonWhitespaceTail(xmlNode* c_node) except -1:
  592. return _hasNonWhitespaceText(c_node, tail=True)
  593. cdef bint _hasNonWhitespaceText(xmlNode* c_node, bint tail=False) except -1:
  594. c_text_node = c_node and _textNodeOrSkip(c_node.next if tail else c_node.children)
  595. if c_text_node is NULL:
  596. return False
  597. while c_text_node is not NULL:
  598. if c_text_node.content[0] != c'\0' and not _collectText(c_text_node).isspace():
  599. return True
  600. c_text_node = _textNodeOrSkip(c_text_node.next)
  601. return False
  602. cdef unicode _collectText(xmlNode* c_node):
  603. """Collect all text nodes and return them as a unicode string.
  604. Start collecting at c_node.
  605. If there was no text to collect, return None
  606. """
  607. cdef Py_ssize_t scount
  608. cdef xmlChar* c_text
  609. cdef xmlNode* c_node_cur
  610. # check for multiple text nodes
  611. scount = 0
  612. c_text = NULL
  613. c_node_cur = c_node = _textNodeOrSkip(c_node)
  614. while c_node_cur is not NULL:
  615. if c_node_cur.content[0] != c'\0':
  616. c_text = c_node_cur.content
  617. scount += 1
  618. c_node_cur = _textNodeOrSkip(c_node_cur.next)
  619. # handle two most common cases first
  620. if c_text is NULL:
  621. return '' if scount > 0 else None
  622. if scount == 1:
  623. return funicode(c_text)
  624. # the rest is not performance critical anymore
  625. result = b''
  626. while c_node is not NULL:
  627. result += <unsigned char*>c_node.content
  628. c_node = _textNodeOrSkip(c_node.next)
  629. return funicode(<const_xmlChar*><unsigned char*>result)
  630. cdef void _removeText(xmlNode* c_node) noexcept:
  631. """Remove all text nodes.
  632. Start removing at c_node.
  633. """
  634. cdef xmlNode* c_next
  635. c_node = _textNodeOrSkip(c_node)
  636. while c_node is not NULL:
  637. c_next = _textNodeOrSkip(c_node.next)
  638. tree.xmlUnlinkNode(c_node)
  639. tree.xmlFreeNode(c_node)
  640. c_node = c_next
  641. cdef xmlNode* _createTextNode(xmlDoc* doc, value) except NULL:
  642. cdef xmlNode* c_text_node
  643. if isinstance(value, CDATA):
  644. c_text_node = tree.xmlNewCDataBlock(
  645. doc, _xcstr((<CDATA>value)._utf8_data),
  646. python.PyBytes_GET_SIZE((<CDATA>value)._utf8_data))
  647. else:
  648. text = _utf8(value)
  649. c_text_node = tree.xmlNewDocText(doc, _xcstr(text))
  650. if not c_text_node:
  651. raise MemoryError()
  652. return c_text_node
  653. cdef int _setNodeText(xmlNode* c_node, value) except -1:
  654. # remove all text nodes at the start first
  655. _removeText(c_node.children)
  656. if value is None:
  657. return 0
  658. # now add new text node with value at start
  659. c_text_node = _createTextNode(c_node.doc, value)
  660. if c_node.children is NULL:
  661. tree.xmlAddChild(c_node, c_text_node)
  662. else:
  663. tree.xmlAddPrevSibling(c_node.children, c_text_node)
  664. return 0
  665. cdef int _setTailText(xmlNode* c_node, value) except -1:
  666. # remove all text nodes at the start first
  667. _removeText(c_node.next)
  668. if value is None:
  669. return 0
  670. # now append new text node with value
  671. c_text_node = _createTextNode(c_node.doc, value)
  672. tree.xmlAddNextSibling(c_node, c_text_node)
  673. return 0
  674. cdef bytes _resolveQNameText(_Element element, value):
  675. cdef xmlNs* c_ns
  676. ns, tag = _getNsTag(value)
  677. if ns is None:
  678. return tag
  679. else:
  680. c_ns = element._doc._findOrBuildNodeNs(
  681. element._c_node, _xcstr(ns), NULL, 0)
  682. return python.PyBytes_FromFormat('%s:%s', c_ns.prefix, _cstr(tag))
  683. cdef inline bint _hasChild(xmlNode* c_node) noexcept:
  684. return c_node is not NULL and _findChildForwards(c_node, 0) is not NULL
  685. cdef inline Py_ssize_t _countElements(xmlNode* c_node) noexcept:
  686. "Counts the elements within the following siblings and the node itself."
  687. cdef Py_ssize_t count
  688. count = 0
  689. while c_node is not NULL:
  690. if _isElement(c_node):
  691. count += 1
  692. c_node = c_node.next
  693. return count
  694. cdef int _findChildSlice(
  695. slice sliceobject, xmlNode* c_parent,
  696. xmlNode** c_start_node, Py_ssize_t* c_step, Py_ssize_t* c_length) except -1:
  697. """Resolve a children slice.
  698. Returns the start node, step size and the slice length in the
  699. pointer arguments.
  700. """
  701. cdef Py_ssize_t start = 0, stop = 0, childcount
  702. childcount = _countElements(c_parent.children)
  703. if childcount == 0:
  704. c_start_node[0] = NULL
  705. c_length[0] = 0
  706. if sliceobject.step is None:
  707. c_step[0] = 1
  708. else:
  709. python._PyEval_SliceIndex(sliceobject.step, c_step)
  710. return 0
  711. python.PySlice_GetIndicesEx(
  712. sliceobject, childcount, &start, &stop, c_step, c_length)
  713. if start > childcount // 2:
  714. c_start_node[0] = _findChildBackwards(c_parent, childcount - start - 1)
  715. else:
  716. c_start_node[0] = _findChild(c_parent, start)
  717. return 0
  718. cdef bint _isFullSlice(slice sliceobject) except -1:
  719. """Conservative guess if this slice is a full slice as in ``s[:]``.
  720. """
  721. cdef Py_ssize_t step = 0
  722. if sliceobject is None:
  723. return 0
  724. if sliceobject.start is None and \
  725. sliceobject.stop is None:
  726. if sliceobject.step is None:
  727. return 1
  728. python._PyEval_SliceIndex(sliceobject.step, &step)
  729. if step == 1:
  730. return 1
  731. return 0
  732. return 0
  733. cdef _collectChildren(_Element element):
  734. cdef xmlNode* c_node
  735. cdef list result = []
  736. c_node = element._c_node.children
  737. if c_node is not NULL:
  738. if not _isElement(c_node):
  739. c_node = _nextElement(c_node)
  740. while c_node is not NULL:
  741. result.append(_elementFactory(element._doc, c_node))
  742. c_node = _nextElement(c_node)
  743. return result
  744. cdef inline xmlNode* _findChild(xmlNode* c_node, Py_ssize_t index) noexcept:
  745. if index < 0:
  746. return _findChildBackwards(c_node, -index - 1)
  747. else:
  748. return _findChildForwards(c_node, index)
  749. cdef inline xmlNode* _findChildForwards(xmlNode* c_node, Py_ssize_t index) noexcept:
  750. """Return child element of c_node with index, or return NULL if not found.
  751. """
  752. cdef xmlNode* c_child
  753. cdef Py_ssize_t c
  754. c_child = c_node.children
  755. c = 0
  756. while c_child is not NULL:
  757. if _isElement(c_child):
  758. if c == index:
  759. return c_child
  760. c += 1
  761. c_child = c_child.next
  762. return NULL
  763. cdef inline xmlNode* _findChildBackwards(xmlNode* c_node, Py_ssize_t index) noexcept:
  764. """Return child element of c_node with index, or return NULL if not found.
  765. Search from the end.
  766. """
  767. cdef xmlNode* c_child
  768. cdef Py_ssize_t c
  769. c_child = c_node.last
  770. c = 0
  771. while c_child is not NULL:
  772. if _isElement(c_child):
  773. if c == index:
  774. return c_child
  775. c += 1
  776. c_child = c_child.prev
  777. return NULL
  778. cdef inline xmlNode* _textNodeOrSkip(xmlNode* c_node) noexcept nogil:
  779. """Return the node if it's a text node. Skip over ignorable nodes in a
  780. series of text nodes. Return NULL if a non-ignorable node is found.
  781. This is used to skip over XInclude nodes when collecting adjacent text
  782. nodes.
  783. """
  784. while c_node is not NULL:
  785. if c_node.type == tree.XML_TEXT_NODE or \
  786. c_node.type == tree.XML_CDATA_SECTION_NODE:
  787. return c_node
  788. elif c_node.type == tree.XML_XINCLUDE_START or \
  789. c_node.type == tree.XML_XINCLUDE_END:
  790. c_node = c_node.next
  791. else:
  792. return NULL
  793. return NULL
  794. cdef inline xmlNode* _nextElement(xmlNode* c_node) noexcept:
  795. """Given a node, find the next sibling that is an element.
  796. """
  797. if c_node is NULL:
  798. return NULL
  799. c_node = c_node.next
  800. while c_node is not NULL:
  801. if _isElement(c_node):
  802. return c_node
  803. c_node = c_node.next
  804. return NULL
  805. cdef inline xmlNode* _previousElement(xmlNode* c_node) noexcept:
  806. """Given a node, find the next sibling that is an element.
  807. """
  808. if c_node is NULL:
  809. return NULL
  810. c_node = c_node.prev
  811. while c_node is not NULL:
  812. if _isElement(c_node):
  813. return c_node
  814. c_node = c_node.prev
  815. return NULL
  816. cdef inline xmlNode* _parentElement(xmlNode* c_node) noexcept:
  817. "Given a node, find the parent element."
  818. if c_node is NULL or not _isElement(c_node):
  819. return NULL
  820. c_node = c_node.parent
  821. if c_node is NULL or not _isElement(c_node):
  822. return NULL
  823. return c_node
  824. cdef inline bint _tagMatches(xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name) noexcept:
  825. """Tests if the node matches namespace URI and tag name.
  826. A node matches if it matches both c_href and c_name.
  827. A node matches c_href if any of the following is true:
  828. * c_href is NULL
  829. * its namespace is NULL and c_href is the empty string
  830. * its namespace string equals the c_href string
  831. A node matches c_name if any of the following is true:
  832. * c_name is NULL
  833. * its name string equals the c_name string
  834. """
  835. if c_node is NULL:
  836. return 0
  837. if c_node.type != tree.XML_ELEMENT_NODE:
  838. # not an element, only succeed if we match everything
  839. return c_name is NULL and c_href is NULL
  840. if c_name is NULL:
  841. if c_href is NULL:
  842. # always match
  843. return 1
  844. else:
  845. c_node_href = _getNs(c_node)
  846. if c_node_href is NULL:
  847. return c_href[0] == c'\0'
  848. else:
  849. return tree.xmlStrcmp(c_node_href, c_href) == 0
  850. elif c_href is NULL:
  851. if _getNs(c_node) is not NULL:
  852. return 0
  853. return c_node.name == c_name or tree.xmlStrcmp(c_node.name, c_name) == 0
  854. elif c_node.name == c_name or tree.xmlStrcmp(c_node.name, c_name) == 0:
  855. c_node_href = _getNs(c_node)
  856. if c_node_href is NULL:
  857. return c_href[0] == c'\0'
  858. else:
  859. return tree.xmlStrcmp(c_node_href, c_href) == 0
  860. else:
  861. return 0
  862. cdef inline bint _tagMatchesExactly(xmlNode* c_node, qname* c_qname) noexcept:
  863. """Tests if the node matches namespace URI and tag name.
  864. This differs from _tagMatches() in that it does not consider a
  865. NULL value in qname.href a wildcard, and that it expects the c_name
  866. to be taken from the doc dict, i.e. it only compares the names by
  867. address.
  868. A node matches if it matches both href and c_name of the qname.
  869. A node matches c_href if any of the following is true:
  870. * its namespace is NULL and c_href is the empty string
  871. * its namespace string equals the c_href string
  872. A node matches c_name if any of the following is true:
  873. * c_name is NULL
  874. * its name string points to the same address (!) as c_name
  875. """
  876. return _nsTagMatchesExactly(_getNs(c_node), c_node.name, c_qname)
  877. cdef inline bint _nsTagMatchesExactly(const_xmlChar* c_node_href,
  878. const_xmlChar* c_node_name,
  879. qname* c_qname) noexcept:
  880. """Tests if name and namespace URI match those of c_qname.
  881. This differs from _tagMatches() in that it does not consider a
  882. NULL value in qname.href a wildcard, and that it expects the c_name
  883. to be taken from the doc dict, i.e. it only compares the names by
  884. address.
  885. A node matches if it matches both href and c_name of the qname.
  886. A node matches c_href if any of the following is true:
  887. * its namespace is NULL and c_href is the empty string
  888. * its namespace string equals the c_href string
  889. A node matches c_name if any of the following is true:
  890. * c_name is NULL
  891. * its name string points to the same address (!) as c_name
  892. """
  893. cdef char* c_href
  894. if c_qname.c_name is not NULL and c_qname.c_name is not c_node_name:
  895. return 0
  896. if c_qname.href is NULL:
  897. return 1
  898. c_href = python.__cstr(c_qname.href)
  899. if c_href[0] == b'\0':
  900. return c_node_href is NULL or c_node_href[0] == b'\0'
  901. elif c_node_href is NULL:
  902. return 0
  903. else:
  904. return tree.xmlStrcmp(<const_xmlChar*>c_href, c_node_href) == 0
cdef Py_ssize_t _mapTagsToQnameMatchArray(xmlDoc* c_doc, list ns_tags,
                                          qname* c_ns_tags, bint force_into_dict) except -1:
    """Map a sequence of (name, namespace) pairs to a qname array for efficient
    matching with _tagMatchesExactly() above.

    For each (ns, tag) pair in ``ns_tags``, the tag name is looked up in the
    dict of ``c_doc`` so that names can later be compared by address.  With
    ``force_into_dict`` the lookup uses xmlDictLookup() and a NULL result
    raises MemoryError; otherwise xmlDictExists() is used and names that are
    not in the dict are skipped.

    Returns the number of entries written to ``c_ns_tags``.

    Note that each qname struct in the array owns its href byte string object
    if it is not NULL.
    """
    cdef Py_ssize_t count = 0, i, c_tag_len
    cdef bytes ns, tag
    cdef const_xmlChar* c_tag

    for ns, tag in ns_tags:
        if tag is None:
            # no name given => store NULL as name
            c_tag = <const_xmlChar*> NULL
        else:
            c_tag_len = len(tag)
            if c_tag_len > limits.INT_MAX:
                # too long, not in the dict => not in the document
                continue
            elif force_into_dict:
                c_tag = tree.xmlDictLookup(c_doc.dict, _xcstr(tag), <int> c_tag_len)
                if c_tag is NULL:
                    # clean up before raising the error:
                    # release the href references taken for earlier entries
                    for i in xrange(count):
                        cpython.ref.Py_XDECREF(c_ns_tags[i].href)
                    raise MemoryError()
            else:
                c_tag = tree.xmlDictExists(c_doc.dict, _xcstr(tag), <int> c_tag_len)
                if c_tag is NULL:
                    # not in the dict => not in the document
                    continue

        c_ns_tags[count].c_name = c_tag
        if ns is None:
            c_ns_tags[count].href = NULL
        else:
            cpython.ref.Py_INCREF(ns) # keep an owned reference!
            c_ns_tags[count].href = <python.PyObject*>ns
        count += 1
    return count
  943. cdef int _removeNode(_Document doc, xmlNode* c_node) except -1:
  944. """Unlink and free a node and subnodes if possible. Otherwise, make sure
  945. it's self-contained.
  946. """
  947. cdef xmlNode* c_next
  948. c_next = c_node.next
  949. tree.xmlUnlinkNode(c_node)
  950. _moveTail(c_next, c_node)
  951. if not attemptDeallocation(c_node):
  952. # make namespaces absolute
  953. moveNodeToDocument(doc, c_node.doc, c_node)
  954. return 0
  955. cdef int _removeSiblings(xmlNode* c_element, tree.xmlElementType node_type, bint with_tail) except -1:
  956. cdef xmlNode* c_node
  957. cdef xmlNode* c_next
  958. c_node = c_element.next
  959. while c_node is not NULL:
  960. c_next = _nextElement(c_node)
  961. if c_node.type == node_type:
  962. if with_tail:
  963. _removeText(c_node.next)
  964. tree.xmlUnlinkNode(c_node)
  965. attemptDeallocation(c_node)
  966. c_node = c_next
  967. c_node = c_element.prev
  968. while c_node is not NULL:
  969. c_next = _previousElement(c_node)
  970. if c_node.type == node_type:
  971. if with_tail:
  972. _removeText(c_node.next)
  973. tree.xmlUnlinkNode(c_node)
  974. attemptDeallocation(c_node)
  975. c_node = c_next
  976. return 0
  977. cdef void _moveTail(xmlNode* c_tail, xmlNode* c_target) noexcept:
  978. cdef xmlNode* c_next
  979. # tail support: look for any text nodes trailing this node and
  980. # move them too
  981. c_tail = _textNodeOrSkip(c_tail)
  982. while c_tail is not NULL:
  983. c_next = _textNodeOrSkip(c_tail.next)
  984. c_target = tree.xmlAddNextSibling(c_target, c_tail)
  985. c_tail = c_next
  986. cdef int _copyTail(xmlNode* c_tail, xmlNode* c_target) except -1:
  987. cdef xmlNode* c_new_tail
  988. # tail copying support: look for any text nodes trailing this node and
  989. # copy it to the target node
  990. c_tail = _textNodeOrSkip(c_tail)
  991. while c_tail is not NULL:
  992. if c_target.doc is not c_tail.doc:
  993. c_new_tail = tree.xmlDocCopyNode(c_tail, c_target.doc, 0)
  994. else:
  995. c_new_tail = tree.xmlCopyNode(c_tail, 0)
  996. if c_new_tail is NULL:
  997. raise MemoryError()
  998. c_target = tree.xmlAddNextSibling(c_target, c_new_tail)
  999. c_tail = _textNodeOrSkip(c_tail.next)
  1000. return 0
  1001. cdef int _copyNonElementSiblings(xmlNode* c_node, xmlNode* c_target) except -1:
  1002. cdef xmlNode* c_copy
  1003. cdef xmlNode* c_sibling = c_node
  1004. while c_sibling.prev != NULL and \
  1005. (c_sibling.prev.type == tree.XML_PI_NODE or
  1006. c_sibling.prev.type == tree.XML_COMMENT_NODE or
  1007. c_sibling.prev.type == tree.XML_DTD_NODE):
  1008. c_sibling = c_sibling.prev
  1009. while c_sibling != c_node:
  1010. if c_sibling.type == tree.XML_DTD_NODE:
  1011. c_copy = <xmlNode*>_copyDtd(<tree.xmlDtd*>c_sibling)
  1012. if c_sibling == <xmlNode*>c_node.doc.intSubset:
  1013. c_target.doc.intSubset = <tree.xmlDtd*>c_copy
  1014. else: # c_sibling == c_node.doc.extSubset
  1015. c_target.doc.extSubset = <tree.xmlDtd*>c_copy
  1016. else:
  1017. c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1)
  1018. if c_copy is NULL:
  1019. raise MemoryError()
  1020. tree.xmlAddPrevSibling(c_target, c_copy)
  1021. c_sibling = c_sibling.next
  1022. while c_sibling.next != NULL and \
  1023. (c_sibling.next.type == tree.XML_PI_NODE or
  1024. c_sibling.next.type == tree.XML_COMMENT_NODE):
  1025. c_sibling = c_sibling.next
  1026. c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1)
  1027. if c_copy is NULL:
  1028. raise MemoryError()
  1029. tree.xmlAddNextSibling(c_target, c_copy)
cdef int _deleteSlice(_Document doc, xmlNode* c_node,
                      Py_ssize_t count, Py_ssize_t step) except -1:
    """Delete slice, ``count`` items starting with ``c_node`` with a step
    width of ``step``.

    A positive ``step`` walks towards following siblings, any other value
    walks towards preceding siblings using the absolute step width.
    A NULL ``c_node`` is a no-op.  Always returns 0 on success.
    """
    cdef xmlNode* c_next
    cdef Py_ssize_t c, i
    cdef _node_to_node_function next_element
    if c_node is NULL:
        return 0
    if step > 0:
        next_element = _nextElement
    else:
        # walk backwards, but count with a positive step width
        step = -step
        next_element = _previousElement
    # now start deleting nodes
    c = 0
    c_next = c_node
    while c_node is not NULL and c < count:
        # find the next node to delete *before* unlinking the current one
        for i in range(step):
            c_next = next_element(c_next)
            if c_next is NULL:
                break
        _removeNode(doc, c_node)
        c += 1
        c_node = c_next
    return 0
cdef int _replaceSlice(_Element parent, xmlNode* c_node,
                       Py_ssize_t slicelength, Py_ssize_t step,
                       bint left_to_right, elements) except -1:
    """Replace the slice of ``slicelength`` elements starting at ``c_node`` with
    positive step width ``step`` by the Elements in ``elements``.  The
    direction is given by the boolean argument ``left_to_right``.

    ``c_node`` may be NULL to indicate the end of the children list.

    Raises ValueError if an extended slice (step != 1 or right-to-left) is
    assigned a sequence of a different length.  Returns 0 on success.
    """
    cdef xmlNode* c_orig_neighbour
    cdef xmlNode* c_next
    cdef xmlDoc* c_source_doc
    cdef _Element element
    cdef Py_ssize_t seqlength, i, c
    cdef _node_to_node_function next_element
    assert step > 0
    if left_to_right:
        next_element = _nextElement
    else:
        next_element = _previousElement

    # materialise arbitrary iterables so that len() works below
    if not isinstance(elements, (list, tuple)):
        elements = list(elements)

    if step != 1 or not left_to_right:
        # *replacing* children stepwise with list => check size!
        seqlength = len(elements)
        if seqlength != slicelength:
            raise ValueError, f"attempt to assign sequence of size {seqlength} " \
                f"to extended slice of size {slicelength}"

    if c_node is NULL:
        # no children yet => add all elements straight away
        if left_to_right:
            for element in elements:
                assert element is not None, "Node must not be None"
                _appendChild(parent, element)
        else:
            for element in elements:
                assert element is not None, "Node must not be None"
                _prependChild(parent, element)
        return 0

    # remove the elements first as some might be re-added
    if left_to_right:
        # L->R, remember left neighbour
        c_orig_neighbour = _previousElement(c_node)
    else:
        # R->L, remember right neighbour
        c_orig_neighbour = _nextElement(c_node)

    # We remove the original slice elements one by one. Since we hold
    # a Python reference to all elements that we will insert, it is
    # safe to let _removeNode() try (and fail) to free them even if
    # the element itself or one of its descendents will be reinserted.
    c = 0
    c_next = c_node
    while c_node is not NULL and c < slicelength:
        # determine the next slice position before unlinking the current node
        for i in range(step):
            c_next = next_element(c_next)
            if c_next is NULL:
                break
        _removeNode(parent._doc, c_node)
        c += 1
        c_node = c_next

    # make sure each element is inserted only once
    # (the loops below all continue on the same iterator)
    elements = iter(elements)

    # find the first node right of the new insertion point
    if left_to_right:
        if c_orig_neighbour is not NULL:
            c_node = next_element(c_orig_neighbour)
        else:
            # before the first element
            c_node = _findChildForwards(parent._c_node, 0)
    elif c_orig_neighbour is NULL:
        # at the end, but reversed stepping
        # append one element and go to the next insertion point
        for element in elements:
            assert element is not None, "Node must not be None"
            _appendChild(parent, element)
            c_node = element._c_node
            if slicelength > 0:
                slicelength -= 1
                for i in range(1, step):
                    c_node = next_element(c_node)
                    if c_node is NULL:
                        break
            # only the first element is handled here
            break
    else:
        c_node = c_orig_neighbour

    if left_to_right:
        # adjust step size after removing slice as we are not stepping
        # over the newly inserted elements
        step -= 1

    # now insert elements where we removed them
    if c_node is not NULL:
        for element in elements:
            assert element is not None, "Node must not be None"
            _assertValidNode(element)
            # move element and tail over
            c_source_doc = element._c_node.doc
            c_next = element._c_node.next
            tree.xmlAddPrevSibling(c_node, element._c_node)
            _moveTail(c_next, element._c_node)

            # integrate element into new document
            moveNodeToDocument(parent._doc, c_source_doc, element._c_node)

            # stop at the end of the slice
            if slicelength > 0:
                slicelength -= 1
                for i in range(step):
                    c_node = next_element(c_node)
                    if c_node is NULL:
                        break
                if c_node is NULL:
                    # ran off the end of the children => add the rest below
                    break
        else:
            # everything inserted
            return 0

    # append the remaining elements at the respective end
    if left_to_right:
        for element in elements:
            assert element is not None, "Node must not be None"
            _assertValidNode(element)
            _appendChild(parent, element)
    else:
        for element in elements:
            assert element is not None, "Node must not be None"
            _assertValidNode(element)
            _prependChild(parent, element)

    return 0
cdef int _linkChild(xmlNode* c_parent, xmlNode* c_node) except -1:
    """Adaptation of 'xmlAddChild()' that deep-fix the document links iteratively.

    Links ``c_node`` in as the last child of ``c_parent`` by setting the
    parent/prev/next/children/last pointers directly, then calls
    _setTreeDoc() with the document of the parent.
    NOTE(review): the next/prev links of c_node are not reset here; the
    callers in this file unlink the node before calling this function.
    """
    assert _isElement(c_node)
    c_node.parent = c_parent
    if c_parent.children is NULL:
        # first child: the node is both head and tail of the child list
        c_parent.children = c_parent.last = c_node
    else:
        # hook the node in behind the current last child
        c_node.prev = c_parent.last
        c_parent.last.next = c_node
        c_parent.last = c_node

    _setTreeDoc(c_node, c_parent.doc)
    return 0
  1194. cdef int _appendChild(_Element parent, _Element child) except -1:
  1195. """Append a new child to a parent element.
  1196. """
  1197. c_node = child._c_node
  1198. c_source_doc = c_node.doc
  1199. # prevent cycles
  1200. if _isAncestorOrSame(c_node, parent._c_node):
  1201. raise ValueError("cannot append parent to itself")
  1202. # store possible text node
  1203. c_next = c_node.next
  1204. # move node itself
  1205. tree.xmlUnlinkNode(c_node)
  1206. # do not call xmlAddChild() here since it would deep-traverse the tree
  1207. _linkChild(parent._c_node, c_node)
  1208. _moveTail(c_next, c_node)
  1209. # uh oh, elements may be pointing to different doc when
  1210. # parent element has moved; change them too..
  1211. moveNodeToDocument(parent._doc, c_source_doc, c_node)
  1212. return 0
  1213. cdef int _prependChild(_Element parent, _Element child) except -1:
  1214. """Prepend a new child to a parent element.
  1215. """
  1216. c_node = child._c_node
  1217. c_source_doc = c_node.doc
  1218. # prevent cycles
  1219. if _isAncestorOrSame(c_node, parent._c_node):
  1220. raise ValueError("cannot append parent to itself")
  1221. # store possible text node
  1222. c_next = c_node.next
  1223. # move node itself
  1224. c_child = _findChildForwards(parent._c_node, 0)
  1225. if c_child is NULL:
  1226. tree.xmlUnlinkNode(c_node)
  1227. # do not call xmlAddChild() here since it would deep-traverse the tree
  1228. _linkChild(parent._c_node, c_node)
  1229. else:
  1230. tree.xmlAddPrevSibling(c_child, c_node)
  1231. _moveTail(c_next, c_node)
  1232. # uh oh, elements may be pointing to different doc when
  1233. # parent element has moved; change them too..
  1234. moveNodeToDocument(parent._doc, c_source_doc, c_node)
  1235. return 0
  1236. cdef int _appendSibling(_Element element, _Element sibling) except -1:
  1237. """Add a new sibling behind an element.
  1238. """
  1239. return _addSibling(element, sibling, as_next=True)
  1240. cdef int _prependSibling(_Element element, _Element sibling) except -1:
  1241. """Add a new sibling before an element.
  1242. """
  1243. return _addSibling(element, sibling, as_next=False)
cdef int _addSibling(_Element element, _Element sibling, bint as_next) except -1:
    """Move ``sibling`` (and its tail text) next to ``element``.

    With ``as_next`` the sibling is placed behind the element and its tail
    text, otherwise directly in front of the element.  Adding an element as
    sibling of itself is a no-op.  Raises ValueError if the sibling is an
    ancestor of the element.
    """
    c_node = sibling._c_node
    c_source_doc = c_node.doc
    # prevent cycles
    if _isAncestorOrSame(c_node, element._c_node):
        if element._c_node is c_node:
            return 0  # nothing to do
        raise ValueError("cannot add ancestor as sibling, please break cycle first")
    # store possible text node
    c_next = c_node.next
    # move node itself
    if as_next:
        # must insert after any tail text
        c_next_node = _nextElement(element._c_node)
        if c_next_node is NULL:
            # no element follows => go to the very last sibling and add behind it
            c_next_node = element._c_node
            while c_next_node.next:
                c_next_node = c_next_node.next
            tree.xmlAddNextSibling(c_next_node, c_node)
        else:
            # insert directly in front of the next element
            tree.xmlAddPrevSibling(c_next_node, c_node)
    else:
        tree.xmlAddPrevSibling(element._c_node, c_node)
    _moveTail(c_next, c_node)
    # uh oh, elements may be pointing to different doc when
    # parent element has moved; change them too..
    moveNodeToDocument(element._doc, c_source_doc, c_node)
    return 0
  1272. cdef inline bint isutf8(const_xmlChar* s) noexcept:
  1273. cdef xmlChar c = s[0]
  1274. while c != c'\0':
  1275. if c & 0x80:
  1276. return True
  1277. s += 1
  1278. c = s[0]
  1279. return False
  1280. cdef bint isutf8l(const_xmlChar* s, size_t length) noexcept:
  1281. """
  1282. Search for non-ASCII characters in the string, knowing its length in advance.
  1283. """
  1284. cdef unsigned int i
  1285. cdef unsigned long non_ascii_mask
  1286. cdef const unsigned long *lptr = <const unsigned long*> s
  1287. cdef const unsigned long *end = lptr + length // sizeof(unsigned long)
  1288. if length >= sizeof(non_ascii_mask):
  1289. # Build constant 0x80808080... mask (and let the C compiler fold it).
  1290. non_ascii_mask = 0
  1291. for i in range(sizeof(non_ascii_mask) // 2):
  1292. non_ascii_mask = (non_ascii_mask << 16) | 0x8080
  1293. # Advance to long-aligned character before we start reading longs.
  1294. while (<size_t>s) % sizeof(unsigned long) and s < <const_xmlChar *>end:
  1295. if s[0] & 0x80:
  1296. return True
  1297. s += 1
  1298. # Read one long at a time
  1299. lptr = <const unsigned long*> s
  1300. while lptr < end:
  1301. if lptr[0] & non_ascii_mask:
  1302. return True
  1303. lptr += 1
  1304. s = <const_xmlChar *>lptr
  1305. while s < (<const_xmlChar *>end + length % sizeof(unsigned long)):
  1306. if s[0] & 0x80:
  1307. return True
  1308. s += 1
  1309. return False
  1310. cdef int _is_valid_xml_ascii(bytes pystring) except -1:
  1311. """Check if a string is XML ascii content."""
  1312. cdef signed char ch
  1313. # When ch is a *signed* char, non-ascii characters are negative integers
  1314. # and xmlIsChar_ch does not accept them.
  1315. for ch in pystring:
  1316. if not tree.xmlIsChar_ch(ch):
  1317. return 0
  1318. return 1
cdef bint _is_valid_xml_utf8(bytes pystring) except -1:
    """Check if a string is like valid UTF-8 XML content.

    Returns 0 if the byte string contains an ASCII byte rejected by
    xmlIsChar_ch() or one of the forbidden three-byte sequences checked
    below, 1 otherwise.  The UTF-8 encoding itself is not validated here.
    """
    cdef const_xmlChar* s = _xcstr(pystring)
    cdef const_xmlChar* c_end = s + len(pystring)
    cdef unsigned long next3 = 0

    # 'next3' is a sliding window over the three bytes s[0], s[1], s[2];
    # preload the first two bytes, the loop shifts in the third one
    if s < c_end - 2:
        next3 = (s[0] << 8) | (s[1])

    while s < c_end - 2:
        next3 = 0x00ffffff & ((next3 << 8) | s[2])
        if s[0] & 0x80:
            # 0xefbfbe and 0xefbfbf are utf-8 encodings of
            # forbidden characters \ufffe and \uffff
            if next3 == 0x00efbfbe or next3 == 0x00efbfbf:
                return 0
            # 0xeda080 and 0xedbfbf are utf-8 encodings of
            # \ud800 and \udfff. Anything between them (inclusive)
            # is forbidden, because they are surrogate blocks in utf-16.
            if 0x00eda080 <= next3 <= 0x00edbfbf:
                return 0
        elif not tree.xmlIsChar_ch(s[0]):
            return 0  # invalid ascii char
        s += 1

    # the last (up to) two bytes cannot start a three-byte sequence,
    # so only the ASCII check applies to them
    while s < c_end:
        if not s[0] & 0x80 and not tree.xmlIsChar_ch(s[0]):
            return 0  # invalid ascii char
        s += 1

    return 1
  1346. cdef inline unicode funicodeOrNone(const_xmlChar* s):
  1347. return funicode(s) if s is not NULL else None
  1348. cdef inline unicode funicodeOrEmpty(const_xmlChar* s):
  1349. return funicode(s) if s is not NULL else ''
cdef unicode funicode(const_xmlChar* s):
    """Decode the C string ``s`` as UTF-8 and return the unicode string.

    No NULL check is done here; callers that may hold a NULL pointer use
    funicodeOrNone() or funicodeOrEmpty(), which test for NULL first.
    """
    return s.decode('UTF-8')
  1352. cdef bytes _utf8(object s):
  1353. """Test if a string is valid user input and encode it to UTF-8.
  1354. Reject all bytes/unicode input that contains non-XML characters.
  1355. Reject all bytes input that contains non-ASCII characters.
  1356. """
  1357. cdef int valid
  1358. cdef bytes utf8_string
  1359. if isinstance(s, unicode):
  1360. utf8_string = (<unicode>s).encode('utf8')
  1361. valid = _is_valid_xml_utf8(utf8_string)
  1362. elif isinstance(s, (bytes, bytearray)):
  1363. utf8_string = s if type(s) is bytes else bytes(s)
  1364. valid = _is_valid_xml_ascii(utf8_string)
  1365. else:
  1366. raise TypeError("Argument must be bytes or unicode, got '%.200s'" % type(s).__name__)
  1367. if not valid:
  1368. raise ValueError(
  1369. "All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters")
  1370. return utf8_string
  1371. cdef bytes _utf8orNone(object s):
  1372. return _utf8(s) if s is not None else None
# Path classification codes returned by _isFilePath().
cdef enum:
    NO_FILE_PATH = 0        # looks like a URL ("scheme://..."); the only falsy value
    ABS_UNIX_FILE_PATH = 1  # starts with '/'
    ABS_WIN_FILE_PATH = 2   # drive letter followed by ':' and end of string or '\'
    REL_FILE_PATH = 3       # anything else, assumed to be a relative path
cdef bint _isFilePath(const_xmlChar* c_path) noexcept:
    """Simple heuristic to see if a path is a filename.

    Returns one of the *_FILE_PATH enum constants.  NO_FILE_PATH is 0, so the
    result is false exactly when the string looks like a "scheme://" URL.
    """
    cdef xmlChar c
    # test if it looks like an absolute Unix path or a Windows network path
    if c_path[0] == c'/':
        return ABS_UNIX_FILE_PATH

    # test if it looks like an absolute Windows path or URL
    if c'a' <= c_path[0] <= c'z' or c'A' <= c_path[0] <= c'Z':
        c_path += 1
        if c_path[0] == c':' and c_path[1] in b'\0\\':
            return ABS_WIN_FILE_PATH  # C: or C:\...

        # test if it looks like a URL with scheme://
        # (skip the remaining ASCII letters of a potential scheme name)
        while c'a' <= c_path[0] <= c'z' or c'A' <= c_path[0] <= c'Z':
            c_path += 1
        if c_path[0] == c':' and c_path[1] == c'/' and c_path[2] == c'/':
            return NO_FILE_PATH

    # assume it's a relative path
    return REL_FILE_PATH
  1396. cdef object _getFSPathOrObject(object obj):
  1397. """
  1398. Get the __fspath__ attribute of an object if it exists.
  1399. Otherwise, the original object is returned.
  1400. """
  1401. if _isString(obj):
  1402. return obj
  1403. try:
  1404. return python.PyOS_FSPath(obj)
  1405. except TypeError:
  1406. return obj
  1407. cdef object _encodeFilename(object filename):
  1408. """Make sure a filename is 8-bit encoded (or None).
  1409. """
  1410. if filename is None:
  1411. return None
  1412. elif isinstance(filename, bytes):
  1413. return filename
  1414. elif isinstance(filename, unicode):
  1415. filename8 = (<unicode>filename).encode('utf8')
  1416. if _isFilePath(<unsigned char*>filename8):
  1417. try:
  1418. return python.PyUnicode_AsEncodedString(
  1419. filename, _C_FILENAME_ENCODING, NULL)
  1420. except UnicodeEncodeError:
  1421. pass
  1422. return filename8
  1423. else:
  1424. raise TypeError("Argument must be string or unicode.")
  1425. cdef object _decodeFilename(const_xmlChar* c_path):
  1426. """Make the filename a unicode string if we are in Py3.
  1427. """
  1428. return _decodeFilenameWithLength(c_path, tree.xmlStrlen(c_path))
cdef object _decodeFilenameWithLength(const_xmlChar* c_path, size_t c_len):
    """Decode the first ``c_len`` bytes of a C filename into a unicode string.

    Decoding is attempted in this order:
    1. the file system encoding, if _isFilePath() accepts the string
    2. UTF-8
    3. Latin-1 with the 'replace' error handler as last resort
    """
    if _isFilePath(c_path):
        try:
            return python.PyUnicode_Decode(
                <const_char*>c_path, c_len, _C_FILENAME_ENCODING, NULL)
        except UnicodeDecodeError:
            # not in the file system encoding => try UTF-8 below
            pass
    try:
        return (<unsigned char*>c_path)[:c_len].decode('UTF-8')
    except UnicodeDecodeError:
        # this is a stupid fallback, but it might still work...
        return (<unsigned char*>c_path)[:c_len].decode('latin-1', 'replace')
cdef object _encodeFilenameUTF8(object filename):
    """Recode filename as UTF-8. Tries ASCII, local filesystem encoding and
    UTF-8 as source encoding.

    Returns None for None and UTF-8 encoded bytes otherwise.  Raises
    UnicodeDecodeError if a byte string is neither decodable with the file
    system encoding nor valid UTF-8, and TypeError for unsupported types.
    """
    cdef char* c_filename
    if filename is None:
        return None
    elif isinstance(filename, bytes):
        if not isutf8l(<bytes>filename, len(<bytes>filename)):
            # plain ASCII!
            return filename
        c_filename = _cstr(<bytes>filename)
        try:
            # try to decode with default encoding
            # (on success, 'filename' becomes unicode and is re-encoded below)
            filename = python.PyUnicode_Decode(
                c_filename, len(<bytes>filename),
                _C_FILENAME_ENCODING, NULL)
        except UnicodeDecodeError as decode_exc:
            try:
                # try if it's proper UTF-8
                (<bytes>filename).decode('utf8')
                return filename
            except UnicodeDecodeError:
                raise decode_exc  # otherwise re-raise original exception
    if isinstance(filename, unicode):
        return (<unicode>filename).encode('utf8')
    else:
        raise TypeError("Argument must be string or unicode.")
  1471. cdef tuple _getNsTag(tag):
  1472. """Given a tag, find namespace URI and tag name.
  1473. Return None for NS uri if no namespace URI provided.
  1474. """
  1475. return __getNsTag(tag, 0)
  1476. cdef tuple _getNsTagWithEmptyNs(tag):
  1477. """Given a tag, find namespace URI and tag name. Return None for NS uri
  1478. if no namespace URI provided, or the empty string if namespace
  1479. part is '{}'.
  1480. """
  1481. return __getNsTag(tag, 1)
cdef tuple __getNsTag(tag, bint empty_ns):
    """Split a tag in '{namespace}name' notation into (namespace, name).

    ``tag`` may be a string or a QName object; it is validated and encoded
    by _utf8().  Both tuple items are byte strings, except that the namespace
    is None if there is no namespace part.  An empty namespace part ('{}')
    gives b'' if ``empty_ns`` is true and None otherwise.

    Raises ValueError for a missing closing '}' and for an empty tag name.
    """
    cdef char* c_tag
    cdef char* c_ns_end
    cdef Py_ssize_t taglen
    cdef Py_ssize_t nslen
    cdef bytes ns = None
    # _isString() is much faster than isinstance()
    if not _isString(tag) and isinstance(tag, QName):
        tag = (<QName>tag).text
    tag = _utf8(tag)
    c_tag = _cstr(tag)
    if c_tag[0] == c'{':
        # skip the opening brace and look for the closing one
        c_tag += 1
        c_ns_end = cstring_h.strchr(c_tag, c'}')
        if c_ns_end is NULL:
            raise ValueError, "Invalid tag name"
        nslen = c_ns_end - c_tag
        # total size minus namespace minus the two braces
        taglen = python.PyBytes_GET_SIZE(tag) - nslen - 2
        if taglen == 0:
            raise ValueError, "Empty tag name"
        if nslen > 0:
            ns = <bytes>c_tag[:nslen]
        elif empty_ns:
            ns = b''
        # the name starts right behind the closing brace
        tag = <bytes>c_ns_end[1:taglen+1]
    elif python.PyBytes_GET_SIZE(tag) == 0:
        raise ValueError, "Empty tag name"
    return ns, tag
  1510. cdef inline int _pyXmlNameIsValid(name_utf8):
  1511. return _xmlNameIsValid(_xcstr(name_utf8)) and b':' not in name_utf8
  1512. cdef inline int _pyHtmlNameIsValid(name_utf8):
  1513. return _htmlNameIsValid(_xcstr(name_utf8))
cdef inline int _xmlNameIsValid(const_xmlChar* c_name) noexcept:
    """Return the result of libxml2's xmlValidateNameValue() for ``c_name``."""
    return tree.xmlValidateNameValue(c_name)
  1516. cdef int _htmlNameIsValid(const_xmlChar* c_name) noexcept:
  1517. if c_name is NULL or c_name[0] == c'\0':
  1518. return 0
  1519. while c_name[0] != c'\0':
  1520. if c_name[0] in b'&<>/"\'\t\n\x0B\x0C\r ':
  1521. return 0
  1522. c_name += 1
  1523. return 1
  1524. cdef bint _characterReferenceIsValid(const_xmlChar* c_name) noexcept:
  1525. cdef bint is_hex
  1526. if c_name[0] == c'x':
  1527. c_name += 1
  1528. is_hex = 1
  1529. else:
  1530. is_hex = 0
  1531. if c_name[0] == c'\0':
  1532. return 0
  1533. while c_name[0] != c'\0':
  1534. if c_name[0] < c'0' or c_name[0] > c'9':
  1535. if not is_hex:
  1536. return 0
  1537. if not (c'a' <= c_name[0] <= c'f'):
  1538. if not (c'A' <= c_name[0] <= c'F'):
  1539. return 0
  1540. c_name += 1
  1541. return 1
  1542. cdef int _tagValidOrRaise(tag_utf) except -1:
  1543. if not _pyXmlNameIsValid(tag_utf):
  1544. raise ValueError(f"Invalid tag name {(<bytes>tag_utf).decode('utf8')!r}")
  1545. return 0
  1546. cdef int _htmlTagValidOrRaise(tag_utf) except -1:
  1547. if not _pyHtmlNameIsValid(tag_utf):
  1548. raise ValueError(f"Invalid HTML tag name {(<bytes>tag_utf).decode('utf8')!r}")
  1549. return 0
  1550. cdef int _attributeValidOrRaise(name_utf) except -1:
  1551. if not _pyXmlNameIsValid(name_utf):
  1552. raise ValueError(f"Invalid attribute name {(<bytes>name_utf).decode('utf8')!r}")
  1553. return 0
  1554. cdef int _prefixValidOrRaise(tag_utf) except -1:
  1555. if not _pyXmlNameIsValid(tag_utf):
  1556. raise ValueError(f"Invalid namespace prefix {(<bytes>tag_utf).decode('utf8')!r}")
  1557. return 0
  1558. cdef int _uriValidOrRaise(uri_utf) except -1:
  1559. cdef uri.xmlURI* c_uri = uri.xmlParseURI(_cstr(uri_utf))
  1560. if c_uri is NULL:
  1561. raise ValueError(f"Invalid namespace URI {(<bytes>uri_utf).decode('utf8')!r}")
  1562. uri.xmlFreeURI(c_uri)
  1563. return 0
  1564. cdef inline unicode _namespacedName(xmlNode* c_node):
  1565. return _namespacedNameFromNsName(_getNs(c_node), c_node.name)
  1566. cdef unicode _namespacedNameFromNsName(const_xmlChar* c_href, const_xmlChar* c_name):
  1567. name = funicode(c_name)
  1568. if c_href is NULL:
  1569. return name
  1570. href = funicode(c_href)
  1571. return f"{{{href}}}{name}"
  1572. cdef _getFilenameForFile(source):
  1573. """Given a Python File or Gzip object, give filename back.
  1574. Returns None if not a file object.
  1575. """
  1576. # urllib2 provides a geturl() method
  1577. try:
  1578. return source.geturl()
  1579. except:
  1580. pass
  1581. # file instances have a name attribute
  1582. try:
  1583. filename = source.name
  1584. if _isString(filename):
  1585. return os_path_abspath(filename)
  1586. except:
  1587. pass
  1588. # gzip file instances have a filename attribute (before Py3k)
  1589. try:
  1590. filename = source.filename
  1591. if _isString(filename):
  1592. return os_path_abspath(filename)
  1593. except:
  1594. pass
  1595. # can't determine filename
  1596. return None