You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

3854 lines
135 KiB

  1. # cython: binding=True
  2. # cython: auto_pickle=False
  3. # cython: language_level=3
  4. """
  5. The ``lxml.etree`` module implements the extended ElementTree API for XML.
  6. """
  7. __docformat__ = "restructuredtext en"
  8. __all__ = [
  9. 'AttributeBasedElementClassLookup', 'C14NError', 'C14NWriterTarget', 'CDATA',
  10. 'Comment', 'CommentBase', 'CustomElementClassLookup', 'DEBUG',
  11. 'DTD', 'DTDError', 'DTDParseError', 'DTDValidateError',
  12. 'DocumentInvalid', 'ETCompatXMLParser', 'ETXPath', 'Element',
  13. 'ElementBase', 'ElementClassLookup', 'ElementDefaultClassLookup',
  14. 'ElementNamespaceClassLookup', 'ElementTree', 'Entity', 'EntityBase',
  15. 'Error', 'ErrorDomains', 'ErrorLevels', 'ErrorTypes', 'Extension',
  16. 'FallbackElementClassLookup', 'FunctionNamespace', 'HTML', 'HTMLParser',
  17. 'ICONV_COMPILED_VERSION',
  18. 'LIBXML_COMPILED_VERSION', 'LIBXML_VERSION',
  19. 'LIBXML_FEATURES',
  20. 'LIBXSLT_COMPILED_VERSION', 'LIBXSLT_VERSION',
  21. 'LXML_VERSION',
  22. 'LxmlError', 'LxmlRegistryError', 'LxmlSyntaxError',
  23. 'NamespaceRegistryError', 'PI', 'PIBase', 'ParseError',
  24. 'ParserBasedElementClassLookup', 'ParserError', 'ProcessingInstruction',
  25. 'PyErrorLog', 'PythonElementClassLookup', 'QName', 'RelaxNG',
  26. 'RelaxNGError', 'RelaxNGErrorTypes', 'RelaxNGParseError',
  27. 'RelaxNGValidateError', 'Resolver', 'Schematron', 'SchematronError',
  28. 'SchematronParseError', 'SchematronValidateError', 'SerialisationError',
  29. 'SubElement', 'TreeBuilder', 'XInclude', 'XIncludeError', 'XML',
  30. 'XMLDTDID', 'XMLID', 'XMLParser', 'XMLSchema', 'XMLSchemaError',
  31. 'XMLSchemaParseError', 'XMLSchemaValidateError', 'XMLSyntaxError',
  32. 'XMLTreeBuilder', 'XPath', 'XPathDocumentEvaluator', 'XPathError',
  33. 'XPathEvalError', 'XPathEvaluator', 'XPathFunctionError', 'XPathResultError',
  34. 'XPathSyntaxError', 'XSLT', 'XSLTAccessControl', 'XSLTApplyError',
  35. 'XSLTError', 'XSLTExtension', 'XSLTExtensionError', 'XSLTParseError',
  36. 'XSLTSaveError', 'canonicalize',
  37. 'cleanup_namespaces', 'clear_error_log', 'dump',
  38. 'fromstring', 'fromstringlist', 'get_default_parser', 'iselement',
  39. 'iterparse', 'iterwalk', 'parse', 'parseid', 'register_namespace',
  40. 'set_default_parser', 'set_element_class_lookup', 'strip_attributes',
  41. 'strip_elements', 'strip_tags', 'tostring', 'tostringlist', 'tounicode',
  42. 'use_global_python_log'
  43. ]
  44. cimport cython
  45. from lxml cimport python
  46. from lxml.includes cimport tree, config
  47. from lxml.includes.tree cimport xmlDoc, xmlNode, xmlAttr, xmlNs, _isElement, _getNs
  48. from lxml.includes.tree cimport const_xmlChar, xmlChar, _xcstr
  49. from lxml.python cimport _cstr, _isString
  50. from lxml.includes cimport xpath
  51. from lxml.includes cimport c14n
  52. # Cython's standard declarations
  53. cimport cpython.mem
  54. cimport cpython.ref
  55. from libc cimport limits, stdio, stdlib
  56. from libc cimport string as cstring_h # not to be confused with stdlib 'string'
  57. from libc.string cimport const_char
  58. cdef object os_path_abspath
  59. from os.path import abspath as os_path_abspath
  60. cdef object BytesIO, StringIO
  61. from io import BytesIO, StringIO
  62. cdef object OrderedDict
  63. from collections import OrderedDict
  64. cdef object _elementpath
  65. from lxml import _elementpath
  66. cdef object sys
  67. import sys
  68. cdef object re
  69. import re
  70. cdef object partial
  71. from functools import partial
  72. cdef object islice
  73. from itertools import islice
  74. cdef object ITER_EMPTY = iter(())
  75. cdef object MutableMapping
  76. from collections.abc import MutableMapping
  77. class _ImmutableMapping(MutableMapping):
  78. def __getitem__(self, key):
  79. raise KeyError, key
  80. def __setitem__(self, key, value):
  81. raise KeyError, key
  82. def __delitem__(self, key):
  83. raise KeyError, key
  84. def __contains__(self, key):
  85. return False
  86. def __len__(self):
  87. return 0
  88. def __iter__(self):
  89. return ITER_EMPTY
  90. iterkeys = itervalues = iteritems = __iter__
  91. cdef object IMMUTABLE_EMPTY_MAPPING = _ImmutableMapping()
  92. del _ImmutableMapping
  93. # the rules
  94. # ---------
  95. # any libxml C argument/variable is prefixed with c_
  96. # any non-public function/class is prefixed with an underscore
  97. # instance creation is always through factories
  98. # what to do with libxml2/libxslt error messages?
  99. # 0 : drop
  100. # 1 : use log
  101. DEF __DEBUG = 1
  102. # maximum number of lines in the libxml2/xslt log if __DEBUG == 1
  103. DEF __MAX_LOG_SIZE = 100
  104. # make the compiled-in debug state publicly available
  105. DEBUG = __DEBUG
  106. # A struct to store a cached qualified tag name+href pair.
  107. # While we can borrow the c_name from the document dict,
  108. # PyPy requires us to store a Python reference for the
  109. # namespace in order to keep the byte buffer alive.
  110. cdef struct qname:
  111. const_xmlChar* c_name
  112. python.PyObject* href
  113. # initialize parser (and threading)
  114. xmlparser.xmlInitParser()
  115. # global per-thread setup
  116. tree.xmlThrDefIndentTreeOutput(1)
  117. tree.xmlThrDefLineNumbersDefaultValue(1)
  118. _initThreadLogging()
  119. # filename encoding
  120. cdef bytes _FILENAME_ENCODING = (sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii').encode("UTF-8")
  121. cdef char* _C_FILENAME_ENCODING = _cstr(_FILENAME_ENCODING)
  122. # set up some default namespace prefixes
  123. cdef dict _DEFAULT_NAMESPACE_PREFIXES = {
  124. b"http://www.w3.org/XML/1998/namespace": b'xml',
  125. b"http://www.w3.org/1999/xhtml": b"html",
  126. b"http://www.w3.org/1999/XSL/Transform": b"xsl",
  127. b"http://www.w3.org/1999/02/22-rdf-syntax-ns#": b"rdf",
  128. b"http://schemas.xmlsoap.org/wsdl/": b"wsdl",
  129. # xml schema
  130. b"http://www.w3.org/2001/XMLSchema": b"xs",
  131. b"http://www.w3.org/2001/XMLSchema-instance": b"xsi",
  132. # dublin core
  133. b"http://purl.org/dc/elements/1.1/": b"dc",
  134. # objectify
  135. b"http://codespeak.net/lxml/objectify/pytype" : b"py",
  136. }
  137. # To avoid runtime encoding overhead, we keep a Unicode copy
  138. # of the uri-prefix mapping as (str, str) items view.
  139. cdef object _DEFAULT_NAMESPACE_PREFIXES_ITEMS = []
  140. cdef _update_default_namespace_prefixes_items():
  141. cdef bytes ns, prefix
  142. global _DEFAULT_NAMESPACE_PREFIXES_ITEMS
  143. _DEFAULT_NAMESPACE_PREFIXES_ITEMS = {
  144. ns.decode('utf-8') : prefix.decode('utf-8')
  145. for ns, prefix in _DEFAULT_NAMESPACE_PREFIXES.items()
  146. }.items()
  147. _update_default_namespace_prefixes_items()
  148. cdef object _check_internal_prefix = re.compile(br"ns\d+$").match
  149. def register_namespace(prefix, uri):
  150. """Registers a namespace prefix that newly created Elements in that
  151. namespace will use. The registry is global, and any existing
  152. mapping for either the given prefix or the namespace URI will be
  153. removed.
  154. """
  155. prefix_utf, uri_utf = _utf8(prefix), _utf8(uri)
  156. if _check_internal_prefix(prefix_utf):
  157. raise ValueError("Prefix format reserved for internal use")
  158. _tagValidOrRaise(prefix_utf)
  159. _uriValidOrRaise(uri_utf)
  160. if (uri_utf == b"http://www.w3.org/XML/1998/namespace" and prefix_utf != b'xml'
  161. or prefix_utf == b'xml' and uri_utf != b"http://www.w3.org/XML/1998/namespace"):
  162. raise ValueError("Cannot change the 'xml' prefix of the XML namespace")
  163. for k, v in list(_DEFAULT_NAMESPACE_PREFIXES.items()):
  164. if k == uri_utf or v == prefix_utf:
  165. del _DEFAULT_NAMESPACE_PREFIXES[k]
  166. _DEFAULT_NAMESPACE_PREFIXES[uri_utf] = prefix_utf
  167. _update_default_namespace_prefixes_items()
  168. # Error superclass for ElementTree compatibility
  169. cdef class Error(Exception):
  170. pass
  171. # module level superclass for all exceptions
  172. cdef class LxmlError(Error):
  173. """Main exception base class for lxml. All other exceptions inherit from
  174. this one.
  175. """
  176. def __init__(self, message, error_log=None):
  177. super(_Error, self).__init__(message)
  178. if error_log is None:
  179. self.error_log = __copyGlobalErrorLog()
  180. else:
  181. self.error_log = error_log.copy()
  182. cdef object _Error = Error
  183. # superclass for all syntax errors
  184. class LxmlSyntaxError(LxmlError, SyntaxError):
  185. """Base class for all syntax errors.
  186. """
  187. cdef class C14NError(LxmlError):
  188. """Error during C14N serialisation.
  189. """
  190. # version information
  191. cdef tuple __unpackDottedVersion(version):
  192. version_list = []
  193. l = (version.decode("ascii").replace('-', '.').split('.') + [0]*4)[:4]
  194. for item in l:
  195. try:
  196. item = int(item)
  197. except ValueError:
  198. if item.startswith('dev'):
  199. count = item[3:]
  200. item = -300
  201. elif item.startswith('alpha'):
  202. count = item[5:]
  203. item = -200
  204. elif item.startswith('beta'):
  205. count = item[4:]
  206. item = -100
  207. else:
  208. count = 0
  209. if count:
  210. item += int(count)
  211. version_list.append(item)
  212. return tuple(version_list)
  213. cdef tuple __unpackIntVersion(int c_version, int base=100):
  214. return (
  215. ((c_version // (base*base)) % base),
  216. ((c_version // base) % base),
  217. (c_version % base)
  218. )
  219. cdef int _LIBXML_VERSION_INT
  220. try:
  221. _LIBXML_VERSION_INT = int(
  222. re.match('[0-9]+', (<unsigned char*>tree.xmlParserVersion).decode("ascii")).group(0))
  223. except Exception:
  224. print("Unknown libxml2 version: " + (<unsigned char*>tree.xmlParserVersion).decode("latin1"))
  225. _LIBXML_VERSION_INT = 0
  226. LIBXML_VERSION = __unpackIntVersion(_LIBXML_VERSION_INT)
  227. LIBXML_COMPILED_VERSION = __unpackIntVersion(tree.LIBXML_VERSION)
  228. LXML_VERSION = __unpackDottedVersion(tree.LXML_VERSION_STRING)
  229. __version__ = tree.LXML_VERSION_STRING.decode("ascii")
  230. cdef extern from *:
  231. """
  232. #ifdef ZLIB_VERNUM
  233. #define __lxml_zlib_version (ZLIB_VERNUM >> 4)
  234. #else
  235. #define __lxml_zlib_version 0
  236. #endif
  237. #ifdef _LIBICONV_VERSION
  238. #define __lxml_iconv_version (_LIBICONV_VERSION << 8)
  239. #else
  240. #define __lxml_iconv_version 0
  241. #endif
  242. """
  243. # zlib isn't included automatically by libxml2's headers
  244. #long ZLIB_HEX_VERSION "__lxml_zlib_version"
  245. long LIBICONV_HEX_VERSION "__lxml_iconv_version"
  246. #ZLIB_COMPILED_VERSION = __unpackIntVersion(ZLIB_HEX_VERSION, base=0x10)
  247. ICONV_COMPILED_VERSION = __unpackIntVersion(LIBICONV_HEX_VERSION, base=0x100)[:2]
  248. cdef extern from "libxml/xmlversion.h":
  249. """
  250. static const char* const _lxml_lib_features[] = {
  251. #ifdef LIBXML_HTML_ENABLED
  252. "html",
  253. #endif
  254. #ifdef LIBXML_FTP_ENABLED
  255. "ftp",
  256. #endif
  257. #ifdef LIBXML_HTTP_ENABLED
  258. "http",
  259. #endif
  260. #ifdef LIBXML_CATALOG_ENABLED
  261. "catalog",
  262. #endif
  263. #ifdef LIBXML_XPATH_ENABLED
  264. "xpath",
  265. #endif
  266. #ifdef LIBXML_ICONV_ENABLED
  267. "iconv",
  268. #endif
  269. #ifdef LIBXML_ICU_ENABLED
  270. "icu",
  271. #endif
  272. #ifdef LIBXML_REGEXP_ENABLED
  273. "regexp",
  274. #endif
  275. #ifdef LIBXML_SCHEMAS_ENABLED
  276. "xmlschema",
  277. #endif
  278. #ifdef LIBXML_SCHEMATRON_ENABLED
  279. "schematron",
  280. #endif
  281. #ifdef LIBXML_ZLIB_ENABLED
  282. "zlib",
  283. #endif
  284. #ifdef LIBXML_LZMA_ENABLED
  285. "lzma",
  286. #endif
  287. 0
  288. };
  289. """
  290. const char* const* _LXML_LIB_FEATURES "_lxml_lib_features"
  291. cdef set _copy_lib_features():
  292. features = set()
  293. feature = _LXML_LIB_FEATURES
  294. while feature[0]:
  295. features.add(feature[0].decode('ASCII'))
  296. feature += 1
  297. return features
  298. LIBXML_COMPILED_FEATURES = _copy_lib_features()
  299. LIBXML_FEATURES = {
  300. feature_name for feature_id, feature_name in [
  301. #XML_WITH_THREAD = 1
  302. #XML_WITH_TREE = 2
  303. #XML_WITH_OUTPUT = 3
  304. #XML_WITH_PUSH = 4
  305. #XML_WITH_READER = 5
  306. #XML_WITH_PATTERN = 6
  307. #XML_WITH_WRITER = 7
  308. #XML_WITH_SAX1 = 8
  309. (xmlparser.XML_WITH_FTP, "ftp"), # XML_WITH_FTP = 9
  310. (xmlparser.XML_WITH_HTTP, "http"), # XML_WITH_HTTP = 10
  311. #XML_WITH_VALID = 11
  312. (xmlparser.XML_WITH_HTML, "html"), # XML_WITH_HTML = 12
  313. #XML_WITH_LEGACY = 13
  314. #XML_WITH_C14N = 14
  315. (xmlparser.XML_WITH_CATALOG, "catalog"), # XML_WITH_CATALOG = 15
  316. (xmlparser.XML_WITH_XPATH, "xpath"), # XML_WITH_XPATH = 16
  317. #XML_WITH_XPTR = 17
  318. #XML_WITH_XINCLUDE = 18
  319. (xmlparser.XML_WITH_ICONV, "iconv"), # XML_WITH_ICONV = 19
  320. #XML_WITH_ISO8859X = 20
  321. #XML_WITH_UNICODE = 21
  322. (xmlparser.XML_WITH_REGEXP, "regexp"), # XML_WITH_REGEXP = 22
  323. #XML_WITH_AUTOMATA = 23
  324. #XML_WITH_EXPR = 24
  325. (xmlparser.XML_WITH_SCHEMAS, "xmlschema"), # XML_WITH_SCHEMAS = 25
  326. (xmlparser.XML_WITH_SCHEMATRON, "schematron"), # XML_WITH_SCHEMATRON = 26
  327. #XML_WITH_MODULES = 27
  328. #XML_WITH_DEBUG = 28
  329. #XML_WITH_DEBUG_MEM = 29
  330. #XML_WITH_DEBUG_RUN = 30 # unused
  331. (xmlparser.XML_WITH_ZLIB, "zlib"), # XML_WITH_ZLIB = 31
  332. (xmlparser.XML_WITH_ICU, "icu"), # XML_WITH_ICU = 32
  333. (xmlparser.XML_WITH_LZMA, "lzma"), # XML_WITH_LZMA = 33
  334. ] if xmlparser.xmlHasFeature(feature_id)
  335. }
  336. cdef bint HAS_ZLIB_COMPRESSION = xmlparser.xmlHasFeature(xmlparser.XML_WITH_ZLIB)
  337. # class for temporary storage of Python references,
  338. # used e.g. for XPath results
  339. @cython.final
  340. @cython.internal
  341. cdef class _TempStore:
  342. cdef list _storage
  343. def __init__(self):
  344. self._storage = []
  345. cdef int add(self, obj) except -1:
  346. self._storage.append(obj)
  347. return 0
  348. cdef int clear(self) except -1:
  349. del self._storage[:]
  350. return 0
  351. # class for temporarily storing exceptions raised in extensions
  352. @cython.internal
  353. cdef class _ExceptionContext:
  354. cdef object _exc_info
  355. cdef int clear(self) except -1:
  356. self._exc_info = None
  357. return 0
  358. cdef void _store_raised(self) noexcept:
  359. try:
  360. self._exc_info = sys.exc_info()
  361. except BaseException as e:
  362. self._store_exception(e)
  363. finally:
  364. return # and swallow any further exceptions
  365. cdef int _store_exception(self, exception) except -1:
  366. self._exc_info = (exception, None, None)
  367. return 0
  368. cdef bint _has_raised(self) except -1:
  369. return self._exc_info is not None
  370. cdef int _raise_if_stored(self) except -1:
  371. if self._exc_info is None:
  372. return 0
  373. type, value, traceback = self._exc_info
  374. self._exc_info = None
  375. if value is None and traceback is None:
  376. raise type
  377. else:
  378. raise type, value, traceback
  379. # type of a function that steps from node to node
  380. ctypedef public xmlNode* (*_node_to_node_function)(xmlNode*)
  381. ################################################################################
  382. # Include submodules
  383. include "proxy.pxi" # Proxy handling (element backpointers/memory/etc.)
  384. include "apihelpers.pxi" # Private helper functions
  385. include "xmlerror.pxi" # Error and log handling
  386. ################################################################################
  387. # Public Python API
  388. @cython.final
  389. @cython.freelist(8)
  390. cdef public class _Document [ type LxmlDocumentType, object LxmlDocument ]:
  391. """Internal base class to reference a libxml document.
  392. When instances of this class are garbage collected, the libxml
  393. document is cleaned up.
  394. """
  395. cdef int _ns_counter
  396. cdef bytes _prefix_tail
  397. cdef xmlDoc* _c_doc
  398. cdef _BaseParser _parser
  399. def __dealloc__(self):
  400. # if there are no more references to the document, it is safe
  401. # to clean the whole thing up, as all nodes have a reference to
  402. # the document
  403. tree.xmlFreeDoc(self._c_doc)
  404. @cython.final
  405. cdef getroot(self):
  406. # return an element proxy for the document root
  407. cdef xmlNode* c_node
  408. c_node = tree.xmlDocGetRootElement(self._c_doc)
  409. if c_node is NULL:
  410. return None
  411. return _elementFactory(self, c_node)
  412. @cython.final
  413. cdef bint hasdoctype(self) noexcept:
  414. # DOCTYPE gets parsed into internal subset (xmlDTD*)
  415. return self._c_doc is not NULL and self._c_doc.intSubset is not NULL
  416. @cython.final
  417. cdef getdoctype(self):
  418. # get doctype info: root tag, public/system ID (or None if not known)
  419. cdef tree.xmlDtd* c_dtd
  420. cdef xmlNode* c_root_node
  421. public_id = None
  422. sys_url = None
  423. c_dtd = self._c_doc.intSubset
  424. if c_dtd is not NULL:
  425. if c_dtd.ExternalID is not NULL:
  426. public_id = funicode(c_dtd.ExternalID)
  427. if c_dtd.SystemID is not NULL:
  428. sys_url = funicode(c_dtd.SystemID)
  429. c_dtd = self._c_doc.extSubset
  430. if c_dtd is not NULL:
  431. if not public_id and c_dtd.ExternalID is not NULL:
  432. public_id = funicode(c_dtd.ExternalID)
  433. if not sys_url and c_dtd.SystemID is not NULL:
  434. sys_url = funicode(c_dtd.SystemID)
  435. c_root_node = tree.xmlDocGetRootElement(self._c_doc)
  436. if c_root_node is NULL:
  437. root_name = None
  438. else:
  439. root_name = funicode(c_root_node.name)
  440. return root_name, public_id, sys_url
  441. @cython.final
  442. cdef getxmlinfo(self):
  443. # return XML version and encoding (or None if not known)
  444. cdef xmlDoc* c_doc = self._c_doc
  445. if c_doc.version is NULL:
  446. version = None
  447. else:
  448. version = funicode(c_doc.version)
  449. if c_doc.encoding is NULL:
  450. encoding = None
  451. else:
  452. encoding = funicode(c_doc.encoding)
  453. return version, encoding
  454. @cython.final
  455. cdef isstandalone(self):
  456. # returns True for "standalone=true",
  457. # False for "standalone=false", None if not provided
  458. if self._c_doc.standalone == -1:
  459. return None
  460. else:
  461. return <bint>(self._c_doc.standalone == 1)
  462. @cython.final
  463. cdef bytes buildNewPrefix(self):
  464. # get a new unique prefix ("nsX") for this document
  465. cdef bytes ns
  466. if self._ns_counter < len(_PREFIX_CACHE):
  467. ns = _PREFIX_CACHE[self._ns_counter]
  468. else:
  469. ns = python.PyBytes_FromFormat("ns%d", self._ns_counter)
  470. if self._prefix_tail is not None:
  471. ns += self._prefix_tail
  472. self._ns_counter += 1
  473. if self._ns_counter < 0:
  474. # overflow!
  475. self._ns_counter = 0
  476. if self._prefix_tail is None:
  477. self._prefix_tail = b"A"
  478. else:
  479. self._prefix_tail += b"A"
  480. return ns
  481. @cython.final
  482. cdef xmlNs* _findOrBuildNodeNs(self, xmlNode* c_node,
  483. const_xmlChar* c_href, const_xmlChar* c_prefix,
  484. bint is_attribute) except NULL:
  485. """Get or create namespace structure for a node. Reuses the prefix if
  486. possible.
  487. """
  488. cdef xmlNs* c_ns
  489. cdef xmlNs* c_doc_ns
  490. cdef python.PyObject* dict_result
  491. if c_node.type != tree.XML_ELEMENT_NODE:
  492. assert c_node.type == tree.XML_ELEMENT_NODE, \
  493. "invalid node type %d, expected %d" % (
  494. c_node.type, tree.XML_ELEMENT_NODE)
  495. # look for existing ns declaration
  496. c_ns = _searchNsByHref(c_node, c_href, is_attribute)
  497. if c_ns is not NULL:
  498. if is_attribute and c_ns.prefix is NULL:
  499. # do not put namespaced attributes into the default
  500. # namespace as this would break serialisation
  501. pass
  502. else:
  503. return c_ns
  504. # none found => determine a suitable new prefix
  505. if c_prefix is NULL:
  506. dict_result = python.PyDict_GetItem(
  507. _DEFAULT_NAMESPACE_PREFIXES, <unsigned char*>c_href)
  508. if dict_result is not NULL:
  509. prefix = <object>dict_result
  510. else:
  511. prefix = self.buildNewPrefix()
  512. c_prefix = _xcstr(prefix)
  513. # make sure the prefix is not in use already
  514. while tree.xmlSearchNs(self._c_doc, c_node, c_prefix) is not NULL:
  515. prefix = self.buildNewPrefix()
  516. c_prefix = _xcstr(prefix)
  517. # declare the namespace and return it
  518. c_ns = tree.xmlNewNs(c_node, c_href, c_prefix)
  519. if c_ns is NULL:
  520. raise MemoryError()
  521. return c_ns
  522. @cython.final
  523. cdef int _setNodeNs(self, xmlNode* c_node, const_xmlChar* c_href) except -1:
  524. "Lookup namespace structure and set it for the node."
  525. c_ns = self._findOrBuildNodeNs(c_node, c_href, NULL, 0)
  526. tree.xmlSetNs(c_node, c_ns)
  527. cdef tuple __initPrefixCache():
  528. cdef int i
  529. return tuple([ python.PyBytes_FromFormat("ns%d", i)
  530. for i in range(26) ])
  531. cdef tuple _PREFIX_CACHE = __initPrefixCache()
  532. cdef _Document _documentFactory(xmlDoc* c_doc, _BaseParser parser):
  533. cdef _Document result
  534. result = _Document.__new__(_Document)
  535. result._c_doc = c_doc
  536. result._ns_counter = 0
  537. result._prefix_tail = None
  538. if parser is None:
  539. parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
  540. result._parser = parser
  541. return result
  542. cdef object _find_invalid_public_id_characters = re.compile(
  543. ur"[^\x20\x0D\x0Aa-zA-Z0-9'()+,./:=?;!*#@$_%-]+").search
  544. cdef class DocInfo:
  545. "Document information provided by parser and DTD."
  546. cdef _Document _doc
  547. def __cinit__(self, tree):
  548. "Create a DocInfo object for an ElementTree object or root Element."
  549. self._doc = _documentOrRaise(tree)
  550. root_name, public_id, system_url = self._doc.getdoctype()
  551. if not root_name and (public_id or system_url):
  552. raise ValueError, "Could not find root node"
  553. @property
  554. def root_name(self):
  555. """Returns the name of the root node as defined by the DOCTYPE."""
  556. root_name, public_id, system_url = self._doc.getdoctype()
  557. return root_name
  558. @cython.final
  559. cdef tree.xmlDtd* _get_c_dtd(self):
  560. """"Return the DTD. Create it if it does not yet exist."""
  561. cdef xmlDoc* c_doc = self._doc._c_doc
  562. cdef xmlNode* c_root_node
  563. cdef const_xmlChar* c_name
  564. if c_doc.intSubset:
  565. return c_doc.intSubset
  566. c_root_node = tree.xmlDocGetRootElement(c_doc)
  567. c_name = c_root_node.name if c_root_node else NULL
  568. return tree.xmlCreateIntSubset(c_doc, c_name, NULL, NULL)
  569. def clear(self):
  570. """Removes DOCTYPE and internal subset from the document."""
  571. cdef xmlDoc* c_doc = self._doc._c_doc
  572. cdef tree.xmlNode* c_dtd = <xmlNode*>c_doc.intSubset
  573. if c_dtd is NULL:
  574. return
  575. tree.xmlUnlinkNode(c_dtd)
  576. tree.xmlFreeNode(c_dtd)
  577. property public_id:
  578. """Public ID of the DOCTYPE.
  579. Mutable. May be set to a valid string or None. If a DTD does not
  580. exist, setting this variable (even to None) will create one.
  581. """
  582. def __get__(self):
  583. root_name, public_id, system_url = self._doc.getdoctype()
  584. return public_id
  585. def __set__(self, value):
  586. cdef xmlChar* c_value = NULL
  587. if value is not None:
  588. match = _find_invalid_public_id_characters(value)
  589. if match:
  590. raise ValueError, f'Invalid character(s) {match.group(0)!r} in public_id.'
  591. value = _utf8(value)
  592. c_value = tree.xmlStrdup(_xcstr(value))
  593. if not c_value:
  594. raise MemoryError()
  595. c_dtd = self._get_c_dtd()
  596. if not c_dtd:
  597. tree.xmlFree(c_value)
  598. raise MemoryError()
  599. if c_dtd.ExternalID:
  600. tree.xmlFree(<void*>c_dtd.ExternalID)
  601. c_dtd.ExternalID = c_value
  602. property system_url:
  603. """System ID of the DOCTYPE.
  604. Mutable. May be set to a valid string or None. If a DTD does not
  605. exist, setting this variable (even to None) will create one.
  606. """
  607. def __get__(self):
  608. root_name, public_id, system_url = self._doc.getdoctype()
  609. return system_url
  610. def __set__(self, value):
  611. cdef xmlChar* c_value = NULL
  612. if value is not None:
  613. bvalue = _utf8(value)
  614. # sys_url may be any valid unicode string that can be
  615. # enclosed in single quotes or quotes.
  616. if b"'" in bvalue and b'"' in bvalue:
  617. raise ValueError(
  618. 'System URL may not contain both single (\') and double quotes (").')
  619. c_value = tree.xmlStrdup(_xcstr(bvalue))
  620. if not c_value:
  621. raise MemoryError()
  622. c_dtd = self._get_c_dtd()
  623. if not c_dtd:
  624. tree.xmlFree(c_value)
  625. raise MemoryError()
  626. if c_dtd.SystemID:
  627. tree.xmlFree(<void*>c_dtd.SystemID)
  628. c_dtd.SystemID = c_value
  629. @property
  630. def xml_version(self):
  631. """Returns the XML version as declared by the document."""
  632. xml_version, encoding = self._doc.getxmlinfo()
  633. return xml_version
  634. @property
  635. def encoding(self):
  636. """Returns the encoding name as declared by the document."""
  637. xml_version, encoding = self._doc.getxmlinfo()
  638. return encoding
  639. @property
  640. def standalone(self):
  641. """Returns the standalone flag as declared by the document. The possible
  642. values are True (``standalone='yes'``), False
  643. (``standalone='no'`` or flag not provided in the declaration),
  644. and None (unknown or no declaration found). Note that a
  645. normal truth test on this value will always tell if the
  646. ``standalone`` flag was set to ``'yes'`` or not.
  647. """
  648. return self._doc.isstandalone()
  649. property URL:
  650. "The source URL of the document (or None if unknown)."
  651. def __get__(self):
  652. if self._doc._c_doc.URL is NULL:
  653. return None
  654. return _decodeFilename(self._doc._c_doc.URL)
  655. def __set__(self, url):
  656. url = _encodeFilename(url)
  657. c_oldurl = self._doc._c_doc.URL
  658. if url is None:
  659. self._doc._c_doc.URL = NULL
  660. else:
  661. self._doc._c_doc.URL = tree.xmlStrdup(_xcstr(url))
  662. if c_oldurl is not NULL:
  663. tree.xmlFree(<void*>c_oldurl)
  664. @property
  665. def doctype(self):
  666. """Returns a DOCTYPE declaration string for the document."""
  667. root_name, public_id, system_url = self._doc.getdoctype()
  668. if system_url:
  669. # If '"' in system_url, we must escape it with single
  670. # quotes, otherwise escape with double quotes. If url
  671. # contains both a single quote and a double quote, XML
  672. # standard is being violated.
  673. if '"' in system_url:
  674. quoted_system_url = f"'{system_url}'"
  675. else:
  676. quoted_system_url = f'"{system_url}"'
  677. if public_id:
  678. if system_url:
  679. return f'<!DOCTYPE {root_name} PUBLIC "{public_id}" {quoted_system_url}>'
  680. else:
  681. return f'<!DOCTYPE {root_name} PUBLIC "{public_id}">'
  682. elif system_url:
  683. return f'<!DOCTYPE {root_name} SYSTEM {quoted_system_url}>'
  684. elif self._doc.hasdoctype():
  685. return f'<!DOCTYPE {root_name}>'
  686. else:
  687. return ''
  688. @property
  689. def internalDTD(self):
  690. """Returns a DTD validator based on the internal subset of the document."""
  691. return _dtdFactory(self._doc._c_doc.intSubset)
  692. @property
  693. def externalDTD(self):
  694. """Returns a DTD validator based on the external subset of the document."""
  695. return _dtdFactory(self._doc._c_doc.extSubset)
  696. @cython.no_gc_clear
  697. cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
  698. """Element class.
  699. References a document object and a libxml node.
  700. By pointing to a Document instance, a reference is kept to
  701. _Document as long as there is some pointer to a node in it.
  702. """
  703. cdef _Document _doc
  704. cdef xmlNode* _c_node
  705. cdef object _tag
  706. def _init(self):
  707. """_init(self)
  708. Called after object initialisation. Custom subclasses may override
  709. this if they recursively call _init() in the superclasses.
  710. """
  711. @cython.linetrace(False)
  712. @cython.profile(False)
  713. def __dealloc__(self):
  714. #print("trying to free node:", <int>self._c_node)
  715. #displayNode(self._c_node, 0)
  716. if self._c_node is not NULL:
  717. _unregisterProxy(self)
  718. attemptDeallocation(self._c_node)
  719. # MANIPULATORS
  720. def __setitem__(self, x, value):
  721. """__setitem__(self, x, value)
  722. Replaces the given subelement index or slice.
  723. """
  724. cdef xmlNode* c_node = NULL
  725. cdef xmlNode* c_next
  726. cdef xmlDoc* c_source_doc
  727. cdef _Element element
  728. cdef bint left_to_right
  729. cdef Py_ssize_t slicelength = 0, step = 0
  730. _assertValidNode(self)
  731. if value is None:
  732. raise ValueError, "cannot assign None"
  733. if isinstance(x, slice):
  734. # slice assignment
  735. _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
  736. if step > 0:
  737. left_to_right = 1
  738. else:
  739. left_to_right = 0
  740. step = -step
  741. _replaceSlice(self, c_node, slicelength, step, left_to_right, value)
  742. return
  743. else:
  744. # otherwise: normal item assignment
  745. element = value
  746. _assertValidNode(element)
  747. c_node = _findChild(self._c_node, x)
  748. if c_node is NULL:
  749. raise IndexError, "list index out of range"
  750. c_source_doc = element._c_node.doc
  751. c_next = element._c_node.next
  752. _removeText(c_node.next)
  753. tree.xmlReplaceNode(c_node, element._c_node)
  754. _moveTail(c_next, element._c_node)
  755. moveNodeToDocument(self._doc, c_source_doc, element._c_node)
  756. if not attemptDeallocation(c_node):
  757. moveNodeToDocument(self._doc, c_node.doc, c_node)
  758. def __delitem__(self, x):
  759. """__delitem__(self, x)
  760. Deletes the given subelement or a slice.
  761. """
  762. cdef xmlNode* c_node = NULL
  763. cdef xmlNode* c_next
  764. cdef Py_ssize_t step = 0, slicelength = 0
  765. _assertValidNode(self)
  766. if isinstance(x, slice):
  767. # slice deletion
  768. if _isFullSlice(<slice>x):
  769. c_node = self._c_node.children
  770. if c_node is not NULL:
  771. if not _isElement(c_node):
  772. c_node = _nextElement(c_node)
  773. while c_node is not NULL:
  774. c_next = _nextElement(c_node)
  775. _removeNode(self._doc, c_node)
  776. c_node = c_next
  777. else:
  778. _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
  779. _deleteSlice(self._doc, c_node, slicelength, step)
  780. else:
  781. # item deletion
  782. c_node = _findChild(self._c_node, x)
  783. if c_node is NULL:
  784. raise IndexError, f"index out of range: {x}"
  785. _removeNode(self._doc, c_node)
  786. def __deepcopy__(self, memo):
  787. "__deepcopy__(self, memo)"
  788. return self.__copy__()
  789. def __copy__(self):
  790. "__copy__(self)"
  791. cdef xmlDoc* c_doc
  792. cdef xmlNode* c_node
  793. cdef _Document new_doc
  794. _assertValidNode(self)
  795. c_doc = _copyDocRoot(self._doc._c_doc, self._c_node) # recursive
  796. new_doc = _documentFactory(c_doc, self._doc._parser)
  797. root = new_doc.getroot()
  798. if root is not None:
  799. return root
  800. # Comment/PI
  801. c_node = c_doc.children
  802. while c_node is not NULL and c_node.type != self._c_node.type:
  803. c_node = c_node.next
  804. if c_node is NULL:
  805. return None
  806. return _elementFactory(new_doc, c_node)
  def set(self, key, value):
      """set(self, key, value)

      Assigns ``value`` to the attribute named ``key``.

      For HTML documents (but not XML or XHTML), ``None`` is accepted as
      value and produces an attribute that consists of its name only.
      """
      _assertValidNode(self)
      _setAttributeValue(self, key, value)
  def append(self, _Element element not None):
      """append(self, element)

      Appends ``element`` as the last subelement of this element.
      """
      # validate both proxies before touching the tree
      _assertValidNode(self)
      _assertValidNode(element)
      _appendChild(self, element)
  def addnext(self, _Element element not None):
      """addnext(self, element)

      Inserts ``element`` as the sibling that directly follows this
      element.

      The typical use case is placing a processing instruction or a
      comment behind the root node of a document.  At the root level,
      only these two node types are accepted and their tail text is
      dropped automatically.
      """
      _assertValidNode(self)
      _assertValidNode(element)
      if self._c_node.parent is not NULL and not _isElement(self._c_node.parent):
          # the parent is not an element, i.e. we are at the root level
          if element._c_node.type != tree.XML_PI_NODE and \
                  element._c_node.type != tree.XML_COMMENT_NODE:
              raise TypeError("Only processing instructions and comments can be siblings of the root element")
          element.tail = None
      _appendSibling(self, element)
  def addprevious(self, _Element element not None):
      """addprevious(self, element)

      Inserts ``element`` as the sibling that directly precedes this
      element.

      The typical use case is placing a processing instruction or a
      comment in front of the root node of a document.  At the root
      level, only these two node types are accepted and their tail text
      is dropped automatically.
      """
      _assertValidNode(self)
      _assertValidNode(element)
      if self._c_node.parent is not NULL and not _isElement(self._c_node.parent):
          # the parent is not an element, i.e. we are at the root level
          if element._c_node.type not in (tree.XML_PI_NODE, tree.XML_COMMENT_NODE):
              raise TypeError("Only processing instructions and comments can be siblings of the root element")
          element.tail = None
      _prependSibling(self, element)
  def extend(self, elements):
      """extend(self, elements)

      Appends each element of the iterable ``elements`` as a new last
      child of this element.  A ``None`` item raises TypeError.
      """
      cdef _Element element
      _assertValidNode(self)
      for element in elements:
          if element is None:
              raise TypeError("Node must not be None")
          # each item is validated right before it gets appended
          _assertValidNode(element)
          _appendChild(self, element)
  def clear(self, bint keep_tail=False):
      """clear(self, keep_tail=False)

      Resets an element.  This function removes all subelements, clears
      all attributes and sets the text and tail properties to None.

      Pass ``keep_tail=True`` to leave the tail text untouched.
      """
      cdef xmlAttr* c_attr
      cdef xmlNode* c_node
      cdef xmlNode* c_node_next
      _assertValidNode(self)
      c_node = self._c_node
      # remove self.text and self.tail
      _removeText(c_node.children)
      if not keep_tail:
          _removeText(c_node.next)
      # remove all attributes
      c_attr = c_node.properties
      if c_attr:
          # detach the attribute list from the node before freeing it
          c_node.properties = NULL
          tree.xmlFreePropList(c_attr)
      # remove all subelements
      c_node = c_node.children
      if c_node and not _isElement(c_node):
          c_node = _nextElement(c_node)
      while c_node is not NULL:
          # look up the successor before the current node is removed
          c_node_next = _nextElement(c_node)
          _removeNode(self._doc, c_node)
          c_node = c_node_next
  def insert(self, index: int, _Element element not None):
      """insert(self, index, element)

      Inserts a subelement at the given position in this element

      If no child is found at ``index``, the element is appended
      instead.  Otherwise it is linked in directly before the child
      found at that position.  Raises ValueError if the
      ``_isAncestorOrSame()`` check reports that inserting the element
      would create a cycle.
      """
      cdef xmlNode* c_node
      cdef xmlNode* c_next
      cdef xmlDoc* c_source_doc
      _assertValidNode(self)
      _assertValidNode(element)
      c_node = _findChild(self._c_node, index)
      if c_node is NULL:
          # no child at that position => append at the end
          _appendChild(self, element)
          return
      # prevent cycles
      if _isAncestorOrSame(element._c_node, self._c_node):
          raise ValueError("cannot append parent to itself")
      # remember the source document and the following sibling of the
      # element before it is relinked; both are needed afterwards
      c_source_doc = element._c_node.doc
      c_next = element._c_node.next
      tree.xmlAddPrevSibling(c_node, element._c_node)
      _moveTail(c_next, element._c_node)
      moveNodeToDocument(self._doc, c_source_doc, element._c_node)
  def remove(self, _Element element not None):
      """remove(self, element)

      Removes a matching subelement.  Unlike the find methods, this
      method compares elements based on identity, not on tag value
      or contents.

      Raises ValueError if ``element`` is not a direct child of this
      element.
      """
      cdef xmlNode* c_node
      cdef xmlNode* c_next
      _assertValidNode(self)
      _assertValidNode(element)
      c_node = element._c_node
      if c_node.parent is not self._c_node:
          raise ValueError, "Element is not a child of this node."
      # remember the following sibling before unlinking, it is passed
      # to _moveTail() afterwards
      c_next = element._c_node.next
      tree.xmlUnlinkNode(c_node)
      _moveTail(c_next, c_node)
      # fix namespace declarations
      moveNodeToDocument(self._doc, c_node.doc, c_node)
  def replace(self, _Element old_element not None,
              _Element new_element not None):
      """replace(self, old_element, new_element)

      Replaces a subelement with the element passed as second argument.

      Raises ValueError if ``old_element`` is not a direct child of
      this element, or if the ``_isAncestorOrSame()`` check reports
      that inserting ``new_element`` would create a cycle.
      """
      cdef xmlNode* c_old_node
      cdef xmlNode* c_old_next
      cdef xmlNode* c_new_node
      cdef xmlNode* c_new_next
      cdef xmlDoc* c_source_doc
      _assertValidNode(self)
      _assertValidNode(old_element)
      _assertValidNode(new_element)
      c_old_node = old_element._c_node
      if c_old_node.parent is not self._c_node:
          raise ValueError, "Element is not a child of this node."
      c_new_node = new_element._c_node
      # prevent cycles
      if _isAncestorOrSame(c_new_node, self._c_node):
          raise ValueError("cannot append parent to itself")
      # replace node
      # the following siblings and the source document are read before
      # the nodes get relinked, they are needed for the fix-ups below
      c_old_next = c_old_node.next
      c_new_next = c_new_node.next
      c_source_doc = c_new_node.doc
      tree.xmlReplaceNode(c_old_node, c_new_node)
      _moveTail(c_new_next, c_new_node)
      _moveTail(c_old_next, c_old_node)
      moveNodeToDocument(self._doc, c_source_doc, c_new_node)
      # fix namespace declarations
      moveNodeToDocument(self._doc, c_old_node.doc, c_old_node)
  962. # PROPERTIES
  property tag:
      """The tag name of the element.

      The value is looked up from the C node on first access and cached
      on the proxy afterwards.
      """
      def __get__(self):
          if self._tag is None:
              # not cached yet => build it from the C node
              _assertValidNode(self)
              self._tag = _namespacedName(self._c_node)
          return self._tag

      def __set__(self, value):
          cdef _BaseParser parser
          _assertValidNode(self)
          ns, name = _getNsTag(value)
          parser = self._doc._parser
          # pick the name validation according to the document's parser
          if parser is None or not parser._for_html:
              _tagValidOrRaise(name)
          else:
              _htmlTagValidOrRaise(name)
          self._tag = value
          tree.xmlNodeSetName(self._c_node, _xcstr(name))
          if ns is not None:
              self._doc._setNodeNs(self._c_node, _xcstr(ns))
          else:
              self._c_node.ns = NULL
  @property
  def attrib(self):
      """The attributes of the element as a dictionary-like object.

      Prefer get(), set(), keys(), values() and items() for accessing
      element attributes where that is possible.
      """
      # every access creates a new _Attrib wrapper around this element
      return _Attrib.__new__(_Attrib, self)
  property text:
      """The text in front of the first subelement.

      Either a string, or None if the element has no such text.
      """
      def __get__(self):
          _assertValidNode(self)
          return _collectText(self._c_node.children)

      def __set__(self, value):
          _assertValidNode(self)
          if isinstance(value, QName):
              # QName values are resolved against this element first
              resolved = _resolveQNameText(self, value)
              value = resolved.decode('utf8')
          _setNodeText(self._c_node, value)

      # using 'del el.text' is the wrong thing to do
      #def __del__(self):
      #    _setNodeText(self._c_node, None)
  property tail:
      """The text that follows the end tag of this element, up to the
      start tag of the next sibling element.

      Either a string, or None if there is no such text.
      """
      def __get__(self):
          _assertValidNode(self)
          return _collectText(self._c_node.next)

      def __set__(self, value):
          _assertValidNode(self)
          _setTailText(self._c_node, value)

      # using 'del el.tail' is the wrong thing to do
      #def __del__(self):
      #    _setTailText(self._c_node, None)
  1022. # not in ElementTree, read-only
  @property
  def prefix(self):
      """Namespace prefix or None.
      """
      # Validate the proxy first, like the other properties that read
      # from the C node do: the lines below dereference self._c_node
      # directly, which must not happen on an invalid proxy.
      _assertValidNode(self)
      if self._c_node.ns is not NULL:
          if self._c_node.ns.prefix is not NULL:
              return funicode(self._c_node.ns.prefix)
      return None
  # not in ElementTree (readable and writable)
  property sourceline:
      """The line number of the element as recorded by the parser,
      or None if it is not known.
      """
      def __get__(self):
          cdef long line
          _assertValidNode(self)
          line = tree.xmlGetLineNo(self._c_node)
          if line > 0:
              return line
          return None

      def __set__(self, line):
          _assertValidNode(self)
          # values of zero or below reset the stored line number to 0
          self._c_node.line = 0 if line <= 0 else line
  1046. # not in ElementTree, read-only
  @property
  def nsmap(self):
      """Mapping of the namespace prefixes to the namespace URIs that
      are known in the context of this Element, including all namespace
      declarations of its parents.

      The returned dict is detached from the Element: modifying it does
      not change the Element.
      """
      _assertValidNode(self)
      return _build_nsmap(self._c_node)
  # not in ElementTree (readable and writable)
  property base:
      """The base URI of the Element (xml:base or HTML base URL).
      None if no base URI is known.

      Without an xml:base attribute on the Element or one of its
      ancestors, the value depends on the URL of the document that
      holds the Element.

      Assigning to this property sets an xml:base attribute on the
      Element, for XML and HTML documents alike.
      """
      def __get__(self):
          _assertValidNode(self)
          c_base = tree.xmlNodeGetBase(self._doc._c_doc, self._c_node)
          if c_base is not NULL:
              # the returned string must be freed once it was decoded
              try:
                  return _decodeFilename(c_base)
              finally:
                  tree.xmlFree(c_base)
          # no base URI on the node => fall back to the document URL
          if self._doc._c_doc.URL is not NULL:
              return _decodeFilename(self._doc._c_doc.URL)
          return None

      def __set__(self, url):
          _assertValidNode(self)
          if url is not None:
              # 'url' keeps the encoded value referenced while
              # c_base points into it
              url = _encodeFilename(url)
              c_base = _xcstr(url)
          else:
              c_base = <const_xmlChar*>NULL
          tree.xmlNodeSetBase(self._c_node, c_base)
  1086. # ACCESSORS
  def __repr__(self):
      """__repr__(self)

      Shows the tag of the element and the id() of its proxy object.
      """
      description = "<Element %s at 0x%x>" % (self.tag, id(self))
      return description
  def __getitem__(self, x):
      """__getitem__(self, x)

      Returns the subelement at the given position or the requested
      slice.

      Slices are returned as a list of elements.  Raises IndexError if
      no subelement exists at a requested index.
      """
      cdef xmlNode* c_node = NULL
      cdef Py_ssize_t step = 0, slicelength = 0
      cdef Py_ssize_t c, i
      cdef _node_to_node_function next_element
      cdef list result
      _assertValidNode(self)
      if isinstance(x, slice):
          # slicing
          if _isFullSlice(<slice>x):
              return _collectChildren(self)
          _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
          if c_node is NULL:
              return []
          # a negative step walks backwards through the siblings,
          # using the absolute step width
          if step > 0:
              next_element = _nextElement
          else:
              step = -step
              next_element = _previousElement
          result = []
          c = 0
          while c_node is not NULL and c < slicelength:
              result.append(_elementFactory(self._doc, c_node))
              c += 1
              # advance by 'step' elements, stop early at the end
              for i in range(step):
                  c_node = next_element(c_node)
                  if c_node is NULL:
                      break
          return result
      else:
          # indexing
          c_node = _findChild(self._c_node, x)
          if c_node is NULL:
              raise IndexError, "list index out of range"
          return _elementFactory(self._doc, c_node)
  def __len__(self):
      """__len__(self)

      Counts the subelements of this element.
      """
      _assertValidNode(self)
      return _countElements(self._c_node.children)
  def __bool__(self):
      """__bool__(self)

      Issues a FutureWarning and then returns whether the element has
      children.
      """
      import warnings
      message = (
          "Truth-testing of elements was a source of confusion and will always "
          "return True in future versions. "
          "Use specific 'len(elem)' or 'elem is not None' test instead."
      )
      warnings.warn(message, FutureWarning)
      # emulate old behaviour
      _assertValidNode(self)
      return _hasChild(self._c_node)
  def __contains__(self, element):
      """__contains__(self, element)

      Tests whether ``element`` is an element whose node has this
      element's node as its direct parent.
      """
      cdef xmlNode* c_node
      _assertValidNode(self)
      if isinstance(element, _Element):
          c_node = (<_Element>element)._c_node
          return c_node is not NULL and c_node.parent is self._c_node
      # anything that is not an element cannot be a child
      return 0
  def __iter__(self):
      """__iter__(self)

      Returns an ElementChildIterator for this element.
      """
      return ElementChildIterator(self)
  def __reversed__(self):
      """__reversed__(self)

      Returns an ElementChildIterator for this element that was created
      with ``reversed=True``.
      """
      return ElementChildIterator(self, reversed=True)
  def index(self, child: _Element, start: int = None, stop: int = None):
      """index(self, child, start=None, stop=None)

      Find the position of the child within the parent.

      ``start`` and ``stop`` restrict the search to a slice of the
      children and may be negative, as in ``list.index()``.  Raises
      ValueError if ``child`` is not a child of this element or if its
      position lies outside of the requested slice.

      This method is not part of the original ElementTree API.
      """
      cdef Py_ssize_t k, l
      cdef Py_ssize_t c_start, c_stop
      cdef xmlNode* c_child
      cdef xmlNode* c_start_node
      _assertValidNode(self)
      _assertValidNode(child)
      c_child = child._c_node
      if c_child.parent is not self._c_node:
          raise ValueError, "Element is not a child of this node."

      # handle the unbounded search straight away (normal case)
      if stop is None and (start is None or start == 0):
          # the index is the number of elements in front of the child
          k = 0
          c_child = c_child.prev
          while c_child is not NULL:
              if _isElement(c_child):
                  k += 1
              c_child = c_child.prev
          return k

      # check indices
      if start is None:
          c_start = 0
      else:
          c_start = start
      if stop is None:
          # 0 is used as the "no upper bound" marker from here on
          c_stop = 0
      else:
          c_stop = stop
          if c_stop == 0 or \
                 c_start >= c_stop and (c_stop > 0 or c_start < 0):
              raise ValueError, "list.index(x): x not in slice"

      # for negative slice indices, check slice before searching index
      if c_start < 0 or c_stop < 0:
          # start from right, at most up to leftmost(c_start, c_stop)
          if c_start < c_stop:
              k = -c_start
          else:
              k = -c_stop
          c_start_node = self._c_node.last
          l = 1
          while c_start_node != c_child and l < k:
              if _isElement(c_start_node):
                  l += 1
              c_start_node = c_start_node.prev
          if c_start_node == c_child:
              # found! before slice end?
              if c_stop < 0 and l <= -c_stop:
                  raise ValueError, "list.index(x): x not in slice"
          elif c_start < 0:
              raise ValueError, "list.index(x): x not in slice"

      # now determine the index backwards from child
      c_child = c_child.prev
      k = 0
      if c_stop > 0:
          # we can optimize: stop after c_stop elements if not found
          while c_child != NULL and k < c_stop:
              if _isElement(c_child):
                  k += 1
              c_child = c_child.prev
          # The position must also respect the lower bound of the slice.
          # A negative c_start was already validated above and always
          # passes this comparison.
          if k < c_stop and k >= c_start:
              return k
      else:
          # traverse all
          while c_child != NULL:
              if _isElement(c_child):
                  k = k + 1
              c_child = c_child.prev
          if c_start > 0:
              if k >= c_start:
                  return k
          else:
              return k
      if c_start != 0 or c_stop != 0:
          raise ValueError, "list.index(x): x not in slice"
      else:
          raise ValueError, "list.index(x): x not in list"
  def get(self, key, default=None):
      """get(self, key, default=None)

      Looks up the value of the element attribute ``key``, passing
      ``default`` on to the attribute lookup.
      """
      _assertValidNode(self)
      return _getAttributeValue(self, key, default)
  def keys(self):
      """keys(self)

      Returns the names of the element attributes as a list.  The
      order of the names is arbitrary (just like for an ordinary Python
      dictionary).
      """
      _assertValidNode(self)
      # collection mode 1 is used for the attribute names
      return _collectAttributes(self._c_node, 1)
  def values(self):
      """values(self)

      Returns the values of the element attributes as a sequence of
      strings.  The order of the attributes is arbitrary.
      """
      _assertValidNode(self)
      # collection mode 2 is used for the attribute values
      return _collectAttributes(self._c_node, 2)
  def items(self):
      """items(self)

      Returns the element attributes as a sequence.  The order of the
      attributes is arbitrary.
      """
      _assertValidNode(self)
      # collection mode 3 is used for the attribute items
      return _collectAttributes(self._c_node, 3)
  def getchildren(self):
      """getchildren(self)

      Returns all direct children of this element in document order.

      :deprecated: This method has been deprecated as of ElementTree 1.3
        and lxml 2.0.  In new code, use ``list(element)`` or simply
        iterate over the element.
      """
      _assertValidNode(self)
      return _collectChildren(self)
  def getparent(self):
      """getparent(self)

      Returns the parent element of this element, or None for the
      root element.
      """
      cdef xmlNode* c_parent
      #_assertValidNode(self) # not needed
      c_parent = _parentElement(self._c_node)
      if c_parent is not NULL:
          return _elementFactory(self._doc, c_parent)
      return None
  def getnext(self):
      """getnext(self)

      Returns the sibling that follows this element, or None if
      there is none.
      """
      cdef xmlNode* c_sibling
      #_assertValidNode(self) # not needed
      c_sibling = _nextElement(self._c_node)
      if c_sibling is not NULL:
          return _elementFactory(self._doc, c_sibling)
      return None
  def getprevious(self):
      """getprevious(self)

      Returns the sibling that precedes this element, or None if
      there is none.
      """
      cdef xmlNode* c_sibling
      #_assertValidNode(self) # not needed
      c_sibling = _previousElement(self._c_node)
      if c_sibling is not NULL:
          return _elementFactory(self._doc, c_sibling)
      return None
  def itersiblings(self, tag=None, *tags, preceding=False):
      """itersiblings(self, tag=None, *tags, preceding=False)

      Iterate over the siblings that follow or precede this element.

      The 'preceding' keyword selects the direction.  With the default
      of False, the iterator runs forward over the following siblings.
      With True, it yields the preceding siblings in reverse document
      order, i.e. it starts right before the current element and moves
      backwards from there.

      The result can be restricted to elements with specific tags,
      see `iter`.
      """
      # shortcut: no neighbouring node in the requested direction
      if preceding:
          if self._c_node and not self._c_node.prev:
              return ITER_EMPTY
      elif self._c_node and not self._c_node.next:
          return ITER_EMPTY
      if tag is not None:
          tags = tags + (tag,)
      return SiblingsIterator(self, tags, preceding=preceding)
  def iterancestors(self, tag=None, *tags):
      """iterancestors(self, tag=None, *tags)

      Iterate over the ancestors of this element, walking up from
      parent to parent.

      The result can be restricted to elements with specific tags,
      see `iter`.
      """
      # shortcut: a node without a parent has no ancestors
      if self._c_node and not self._c_node.parent:
          return ITER_EMPTY
      if tag is not None:
          tags = tags + (tag,)
      return AncestorsIterator(self, tags)
  def iterdescendants(self, tag=None, *tags):
      """iterdescendants(self, tag=None, *tags)

      Iterate over the descendants of this element in document order.

      Unlike ``el.iter()``, the iterator does not yield the element
      itself.  The result can be restricted to elements with specific
      tags, see `iter`.
      """
      # shortcut: a node without child nodes has no descendants
      if self._c_node and not self._c_node.children:
          return ITER_EMPTY
      if tag is not None:
          tags = tags + (tag,)
      return ElementDepthFirstIterator(self, tags, inclusive=False)
  def iterchildren(self, tag=None, *tags, reversed=False):
      """iterchildren(self, tag=None, *tags, reversed=False)

      Iterate over the children of this element.

      Unlike plain iteration over the element, the order can be
      reversed with the 'reversed' keyword and the result can be
      restricted to elements with specific tags, see `iter`.
      """
      # shortcut: a node without child nodes has nothing to iterate
      if self._c_node and not self._c_node.children:
          return ITER_EMPTY
      if tag is not None:
          tags = tags + (tag,)
      return ElementChildIterator(self, tags, reversed=reversed)
  def getroottree(self):
      """getroottree(self)

      Return an ElementTree for the root node of the document that
      contains this element.

      The result is the same as walking up the tree with
      element.getparent() until it returns None (for the root element)
      and then building an ElementTree for the last parent that was
      returned.
      """
      _assertValidDoc(self._doc)
      return _elementTreeFactory(self._doc, None)
  def getiterator(self, tag=None, *tags):
      """getiterator(self, tag=None, *tags)

      Returns a sequence or iterator of all elements in the subtree in
      document order (depth first pre-order), starting with this
      element.

      The result can be restricted to elements with specific tags,
      see `iter`.

      :deprecated: This method is deprecated as of ElementTree 1.3 and
        lxml 2.0.  In lxml, it returns an iterator, which diverges from
        the original ElementTree behaviour.  Use the ``element.iter()``
        method if you want an efficient iterator.  New code should only
        call this method if it requires backwards compatibility with
        older versions of lxml or ElementTree.
      """
      if tag is not None:
          tags = tags + (tag,)
      return ElementDepthFirstIterator(self, tags)
  def iter(self, tag=None, *tags):
      """iter(self, tag=None, *tags)

      Iterate over all elements in the subtree in document order (depth
      first pre-order), starting with this element.

      To find only elements with specific tags, pass ``"{ns}localname"``
      as tag.  Either or both of ``ns`` and ``localname`` can be ``*``
      for a wildcard; ``ns`` can be empty for no namespace.
      ``"localname"`` is equivalent to ``"{}localname"`` (i.e. no
      namespace) but ``"*"`` is ``"{*}*"`` (any or no namespace), not
      ``"{}*"``.

      The Element, Comment, ProcessingInstruction and Entity factory
      functions can also be passed, in order to look only for the
      specific element type.

      When multiple tags (or a sequence of tags) are passed instead of a
      single tag, the iterator returns all elements that match any of
      these tags, in document order.
      """
      if tag is not None:
          tags = tags + (tag,)
      return ElementDepthFirstIterator(self, tags)
  def itertext(self, tag=None, *tags, with_tail=True):
      """itertext(self, tag=None, *tags, with_tail=True)

      Iterates over the text content of a subtree.

      Tag names can be passed to restrict the text content to specific
      elements, see `iter`.

      Setting the ``with_tail`` keyword argument to ``False`` skips
      over tail text.
      """
      if tag is not None:
          tags = tags + (tag,)
      return ElementTextIterator(self, tags, with_tail=with_tail)
  def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
      """makeelement(self, _tag, attrib=None, nsmap=None, **_extra)

      Creates a new element that is associated with the document of
      this element.
      """
      _assertValidDoc(self._doc)
      return _makeElement(
          _tag, NULL, self._doc, None, None, None, attrib, nsmap, _extra)
  def find(self, path, namespaces=None):
      """find(self, path, namespaces=None)

      Finds the first matching subelement, by tag name or path.

      Pass a prefix-to-namespace mapping as ``namespaces`` in order to
      use XPath prefixes in the path expression.
      """
      if isinstance(path, QName):
          # a QName is searched by its text
          path = (<QName>path).text
      return _elementpath.find(
          self, path, namespaces, with_prefixes=not _isHtmlDocument(self))
  def findtext(self, path, default=None, namespaces=None):
      """findtext(self, path, default=None, namespaces=None)

      Finds text for the first matching subelement, by tag name or path.

      Pass a prefix-to-namespace mapping as ``namespaces`` in order to
      use XPath prefixes in the path expression.
      """
      if isinstance(path, QName):
          # a QName is searched by its text
          path = (<QName>path).text
      return _elementpath.findtext(
          self, path, default, namespaces, with_prefixes=not _isHtmlDocument(self))
  def findall(self, path, namespaces=None):
      """findall(self, path, namespaces=None)

      Finds all matching subelements, by tag name or path.

      Pass a prefix-to-namespace mapping as ``namespaces`` in order to
      use XPath prefixes in the path expression.
      """
      if isinstance(path, QName):
          # a QName is searched by its text
          path = (<QName>path).text
      return _elementpath.findall(
          self, path, namespaces, with_prefixes=not _isHtmlDocument(self))
  def iterfind(self, path, namespaces=None):
      """iterfind(self, path, namespaces=None)

      Iterates over all matching subelements, by tag name or path.

      Pass a prefix-to-namespace mapping as ``namespaces`` in order to
      use XPath prefixes in the path expression.
      """
      if isinstance(path, QName):
          # a QName is searched by its text
          path = (<QName>path).text
      return _elementpath.iterfind(
          self, path, namespaces, with_prefixes=not _isHtmlDocument(self))
  def xpath(self, _path, *, namespaces=None, extensions=None,
            smart_strings=True, **_variables):
      """xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)

      Evaluate an xpath expression using the element as context node.

      A new XPathElementEvaluator is created for each call; additional
      keyword arguments are handed to it as ``_variables`` when it is
      called with the path.
      """
      return XPathElementEvaluator(
          self,
          namespaces=namespaces,
          extensions=extensions,
          smart_strings=smart_strings,
      )(_path, **_variables)
  def cssselect(self, expr, *, translator='xml'):
      """cssselect(self, expr, translator='xml')

      Run the CSS expression on this element and its children,
      returning a list of the results.

      Equivalent to
      ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``
      -- note that pre-compiling the expression can provide a
      substantial speedup.
      """
      # Do the import here to make the dependency optional.
      from lxml.cssselect import CSSSelector
      return CSSSelector(expr, translator=translator)(self)
  @cython.linetrace(False)
  cdef _Element _elementFactory(_Document doc, xmlNode* c_node):
      # Returns the Python proxy for the given C node.
      #
      # An already existing proxy is reused.  Otherwise, the element
      # class is determined through the configured class lookup, a new
      # instance is created and registered for the node and, for classes
      # other than _Element, its _init() method is called.
      # Returns None for a NULL node.
      cdef _Element result
      result = getProxy(c_node)
      if result is not None:
          return result
      if c_node is NULL:
          return None

      element_class = <type> LOOKUP_ELEMENT_CLASS(
          ELEMENT_CLASS_LOOKUP_STATE, doc, c_node)
      # fast exact type check first, isinstance() only if that fails
      if type(element_class) is not type:
          if not isinstance(element_class, type):
              raise TypeError(f"Element class is not a type, got {type(element_class)}")
      if hasProxy(c_node):
          # prevent re-entry race condition - we just called into Python
          return getProxy(c_node)
      result = element_class.__new__(element_class)
      if hasProxy(c_node):
          # prevent re-entry race condition - we just called into Python
          # the new instance is dropped in favour of the registered proxy
          result._c_node = NULL
          return getProxy(c_node)

      _registerProxy(result, doc, c_node)
      if element_class is not _Element:
          result._init()
      return result
  @cython.internal
  cdef class __ContentOnlyElement(_Element):
      """Base class of the node types that only carry text content:
      comments, processing instructions and entities.

      Modifying children or attributes raises TypeError.  The read
      accessors behave like those of an element that has neither
      children nor attributes.
      """
      cdef int _raiseImmutable(self) except -1:
          raise TypeError, "this element does not have children or attributes"

      def set(self, key, value):
          "set(self, key, value)"
          self._raiseImmutable()

      def append(self, value):
          "append(self, value)"
          self._raiseImmutable()

      def insert(self, index, value):
          "insert(self, index, value)"
          self._raiseImmutable()

      def __setitem__(self, index, value):
          "__setitem__(self, index, value)"
          self._raiseImmutable()

      @property
      def attrib(self):
          return IMMUTABLE_EMPTY_MAPPING

      property text:
          def __get__(self):
              _assertValidNode(self)
              return funicodeOrEmpty(self._c_node.content)

          def __set__(self, value):
              _assertValidNode(self)
              if value is None:
                  c_text = <const_xmlChar*>NULL
              else:
                  # 'value' keeps the encoded text referenced while
                  # c_text points into it
                  value = _utf8(value)
                  c_text = _xcstr(value)
              tree.xmlNodeSetContent(self._c_node, c_text)

      # ACCESSORS
      def __getitem__(self, x):
          "__getitem__(self, x)"
          if isinstance(x, slice):
              return []
          else:
              raise IndexError, "list index out of range"

      def __len__(self):
          "__len__(self)"
          return 0

      def get(self, key, default=None):
          "get(self, key, default=None)"
          # there are no attributes, so every lookup yields the default
          return default

      def keys(self):
          "keys(self)"
          return []

      def items(self):
          "items(self)"
          return []

      def values(self):
          "values(self)"
          return []
  cdef class _Comment(__ContentOnlyElement):
      """Content-only element whose tag is the ``Comment`` factory."""
      @property
      def tag(self):
          return Comment

      def __repr__(self):
          content = self.text
          return "<!--%s-->" % content
  cdef class _ProcessingInstruction(__ContentOnlyElement):
      """Content-only element whose tag is the ``ProcessingInstruction``
      factory."""
      @property
      def tag(self):
          return ProcessingInstruction

      property target:
          # not in ElementTree
          def __get__(self):
              _assertValidNode(self)
              return funicode(self._c_node.name)

          def __set__(self, value):
              _assertValidNode(self)
              # the encoded target stays referenced during the C call
              target_utf = _utf8(value)
              tree.xmlNodeSetName(self._c_node, _xcstr(target_utf))

      def __repr__(self):
          text = self.text
          if not text:
              return "<?%s?>" % self.target
          return "<?%s %s?>" % (self.target, text)

      def get(self, key, default=None):
          """get(self, key, default=None)

          Parses the text content of the processing instruction into
          pseudo-attributes, looks for one named ``key`` and returns
          the value associated with it.

          This is only a convenience method for the most common case
          that all text content is structured in attribute-like
          name-value pairs with properly quoted values.  There is no
          guarantee that it works for all possible text content.
          """
          pseudo_attributes = self.attrib
          return pseudo_attributes.get(key, default)

      @property
      def attrib(self):
          """Returns a dict of all pseudo-attributes that could be
          parsed from the text content of this processing instruction.

          Modifying the dict currently has no effect on the XML node,
          although this is not guaranteed to stay this way.
          """
          pseudo_attributes = {}
          # each match has the name and the single-quoted or the
          # double-quoted value
          for name, single_quoted, double_quoted in _FIND_PI_ATTRIBUTES(' ' + self.text):
              pseudo_attributes[name] = single_quoted or double_quoted
          return pseudo_attributes
  # Finds all pseudo-attributes in a string: whitespace, a name, '=' and a
  # value in single or double quotes.  Each match is a tuple of
  # (name, single-quoted value, double-quoted value); the group of the
  # quoting style that was not used is empty.
  cdef object _FIND_PI_ATTRIBUTES = re.compile(r'\s+(\w+)\s*=\s*(?:\'([^\']*)\'|"([^"]*)")', re.U).findall
  cdef class _Entity(__ContentOnlyElement):
      """Content-only element whose tag is the ``Entity`` factory."""
      @property
      def tag(self):
          return Entity

      property name:
          # not in ElementTree
          def __get__(self):
              _assertValidNode(self)
              return funicode(self._c_node.name)

          def __set__(self, value):
              _assertValidNode(self)
              encoded_name = _utf8(value)
              # names must not contain the entity reference delimiters
              if b'&' in encoded_name or b';' in encoded_name:
                  raise ValueError(f"Invalid entity name '{value}'")
              tree.xmlNodeSetName(self._c_node, _xcstr(encoded_name))

      @property
      def text(self):
          # FIXME: should this be None or '&[VALUE];' or the resolved
          # entity value ?
          _assertValidNode(self)
          entity_name = funicode(self._c_node.name)
          return f'&{entity_name};'

      def __repr__(self):
          return "&%s;" % self.name
  1634. cdef class QName:
  1635. """QName(text_or_uri_or_element, tag=None)
  1636. QName wrapper for qualified XML names.
  1637. Pass a tag name by itself or a namespace URI and a tag name to
  1638. create a qualified name. Alternatively, pass an Element to
  1639. extract its tag name. ``None`` as first argument is ignored in
  1640. order to allow for generic 2-argument usage.
  1641. The ``text`` property holds the qualified name in
  1642. ``{namespace}tagname`` notation. The ``namespace`` and
  1643. ``localname`` properties hold the respective parts of the tag
  1644. name.
  1645. You can pass QName objects wherever a tag name is expected. Also,
  1646. setting Element text from a QName will resolve the namespace prefix
  1647. on assignment and set a qualified text value. This is helpful in XML
  1648. languages like SOAP or XML-Schema that use prefixed tag names in
  1649. their text content.
  1650. """
  1651. cdef readonly unicode text
  1652. cdef readonly unicode localname
  1653. cdef readonly unicode namespace
  1654. def __init__(self, text_or_uri_or_element, tag=None):
  1655. if text_or_uri_or_element is None:
  1656. # Allow None as no namespace.
  1657. text_or_uri_or_element, tag = tag, None
  1658. if not _isString(text_or_uri_or_element):
  1659. if isinstance(text_or_uri_or_element, _Element):
  1660. text_or_uri_or_element = (<_Element>text_or_uri_or_element).tag
  1661. if not _isString(text_or_uri_or_element):
  1662. raise ValueError, f"Invalid input tag of type {type(text_or_uri_or_element)!r}"
  1663. elif isinstance(text_or_uri_or_element, QName):
  1664. text_or_uri_or_element = (<QName>text_or_uri_or_element).text
  1665. elif text_or_uri_or_element is not None:
  1666. text_or_uri_or_element = unicode(text_or_uri_or_element)
  1667. else:
  1668. raise ValueError, f"Invalid input tag of type {type(text_or_uri_or_element)!r}"
  1669. ns_utf, tag_utf = _getNsTag(text_or_uri_or_element)
  1670. if tag is not None:
  1671. # either ('ns', 'tag') or ('{ns}oldtag', 'newtag')
  1672. if ns_utf is None:
  1673. ns_utf = tag_utf # case 1: namespace ended up as tag name
  1674. tag_utf = _utf8(tag)
  1675. _tagValidOrRaise(tag_utf)
  1676. self.localname = (<bytes>tag_utf).decode('utf8')
  1677. if ns_utf is None:
  1678. self.namespace = None
  1679. self.text = self.localname
  1680. else:
  1681. self.namespace = (<bytes>ns_utf).decode('utf8')
  1682. self.text = "{%s}%s" % (self.namespace, self.localname)
  1683. def __str__(self):
  1684. return self.text
  1685. def __hash__(self):
  1686. return hash(self.text)
  1687. def __richcmp__(self, other, int op):
  1688. try:
  1689. if type(other) is QName:
  1690. other = (<QName>other).text
  1691. elif not isinstance(other, unicode):
  1692. other = unicode(other)
  1693. except (ValueError, UnicodeDecodeError):
  1694. return NotImplemented
  1695. return python.PyObject_RichCompare(self.text, other, op)
  1696. cdef public class _ElementTree [ type LxmlElementTreeType,
  1697. object LxmlElementTree ]:
  1698. cdef _Document _doc
  1699. cdef _Element _context_node
  1700. # Note that _doc is only used to store the original document if we do not
  1701. # have a _context_node. All methods should prefer self._context_node._doc
  1702. # to honour tree restructuring. _doc can happily be None!
    @cython.final
    cdef int _assertHasRoot(self) except -1:
        """We have to take care here: the document may not have a root node!
        This can happen if ElementTree() is called without any argument and
        the caller 'forgets' to call parse() afterwards, so this is a bug in
        the caller program.

        Returns 0 if a context (root) node is set; otherwise the assertion
        below fails with an AssertionError.
        """
        assert self._context_node is not None, \
            "ElementTree not initialized, missing root"
        return 0
  1713. def parse(self, source, _BaseParser parser=None, *, base_url=None):
  1714. """parse(self, source, parser=None, base_url=None)
  1715. Updates self with the content of source and returns its root.
  1716. """
  1717. cdef _Document doc = None
  1718. try:
  1719. doc = _parseDocument(source, parser, base_url)
  1720. except _TargetParserResult as result_container:
  1721. # raises a TypeError if we don't get an _Element
  1722. self._context_node = result_container.result
  1723. else:
  1724. self._context_node = doc.getroot()
  1725. self._doc = None if self._context_node is not None else doc
  1726. return self._context_node
  1727. def _setroot(self, _Element root not None):
  1728. """_setroot(self, root)
  1729. Relocate the ElementTree to a new root node.
  1730. """
  1731. _assertValidNode(root)
  1732. if root._c_node.type != tree.XML_ELEMENT_NODE:
  1733. raise TypeError, "Only elements can be the root of an ElementTree"
  1734. self._context_node = root
  1735. self._doc = None
    def getroot(self):
        """getroot(self)

        Gets the root element for this tree.

        Returns the stored context node, which is ``None`` if no root
        has been set for this tree.
        """
        return self._context_node
    def __copy__(self):
        # Shallow copy: builds a new tree object from the same document
        # and context node references as this one.
        return _elementTreeFactory(self._doc, self._context_node)
  1743. def __deepcopy__(self, memo):
  1744. cdef _Element root
  1745. cdef _Document doc
  1746. cdef xmlDoc* c_doc
  1747. if self._context_node is not None:
  1748. root = self._context_node.__copy__()
  1749. assert root is not None
  1750. _assertValidNode(root)
  1751. _copyNonElementSiblings(self._context_node._c_node, root._c_node)
  1752. return _elementTreeFactory(None, root)
  1753. elif self._doc is not None:
  1754. _assertValidDoc(self._doc)
  1755. c_doc = tree.xmlCopyDoc(self._doc._c_doc, 1)
  1756. if c_doc is NULL:
  1757. raise MemoryError()
  1758. doc = _documentFactory(c_doc, self._doc._parser)
  1759. return _elementTreeFactory(doc, None)
  1760. else:
  1761. # so what ...
  1762. return self
    # not in ElementTree
    @property
    def docinfo(self) -> DocInfo:
        """Information about the document provided by parser and DTD.

        Requires the tree to have a root node: the ``DocInfo`` is built
        from the document of the context node.
        """
        self._assertHasRoot()
        return DocInfo(self._context_node._doc)
  1769. # not in ElementTree, read-only
  1770. @property
  1771. def parser(self):
  1772. """The parser that was used to parse the document in this ElementTree.
  1773. """
  1774. if self._context_node is not None and \
  1775. self._context_node._doc is not None:
  1776. return self._context_node._doc._parser
  1777. if self._doc is not None:
  1778. return self._doc._parser
  1779. return None
    def write(self, file, *, encoding=None, method="xml",
              bint pretty_print=False, xml_declaration=None, bint with_tail=True,
              standalone=None, doctype=None, compression=0,
              bint exclusive=False, inclusive_ns_prefixes=None,
              bint with_comments=True, bint strip_text=False,
              docstring=None):
        """write(self, file, encoding=None, method="xml",
                  pretty_print=False, xml_declaration=None, with_tail=True,
                  standalone=None, doctype=None, compression=0,
                  exclusive=False, inclusive_ns_prefixes=None,
                  with_comments=True, strip_text=False)

        Write the tree to a filename, file or file-like object.

        Defaults to ASCII encoding and writing a declaration as needed.

        The keyword argument 'method' selects the output method:
        'xml', 'html', 'text', 'c14n' or 'c14n2'.  Default is 'xml'.

        With ``method="c14n"`` (C14N version 1), the options ``exclusive``,
        ``with_comments`` and ``inclusive_ns_prefixes`` request exclusive
        C14N, include comments, and list the inclusive prefixes respectively.

        With ``method="c14n2"`` (C14N version 2), the ``with_comments`` and
        ``strip_text`` options control the output of comments and text space
        according to C14N 2.0.

        Passing a boolean value to the ``standalone`` option will
        output an XML declaration with the corresponding
        ``standalone`` flag.

        The ``doctype`` option allows passing in a plain string that will
        be serialised before the XML tree.  Note that passing in non
        well-formed content here will make the XML output non well-formed.
        Also, an existing doctype in the document tree will not be removed
        when serialising an ElementTree instance.

        The ``compression`` option enables GZip compression level 1-9.

        The ``inclusive_ns_prefixes`` should be a list of namespace strings
        (i.e. ['xs', 'xsi']) that will be promoted to the top-level element
        during exclusive C14N serialisation.  This parameter is ignored if
        exclusive mode=False.

        If exclusive=True and no list is provided, a namespace will only be
        rendered if it is used by the immediate parent or one of its attributes
        and its prefix and values have not already been rendered by an ancestor
        of the namespace node's parent element.

        The ``docstring`` option is a deprecated alias of ``doctype``; it is
        only used (with a DeprecationWarning) when ``doctype`` is not given.

        Raises ValueError if ``encoding`` or a true ``xml_declaration`` is
        combined with a C14N method, or if ``with_comments`` is false for a
        non-C14N method.
        """
        cdef bint write_declaration
        cdef int is_standalone

        self._assertHasRoot()
        _assertValidNode(self._context_node)
        # None and negative values both mean "no compression"
        if compression is None or compression < 0:
            compression = 0

        # C14N serialisation
        if method in ('c14n', 'c14n2'):
            if encoding is not None:
                raise ValueError("Cannot specify encoding with C14N")
            if xml_declaration:
                raise ValueError("Cannot enable XML declaration in C14N")

            if method == 'c14n':
                _tofilelikeC14N(file, self._context_node, exclusive, with_comments,
                                compression, inclusive_ns_prefixes)
            else:  # c14n2
                with _open_utf8_file(file, compression=compression) as f:
                    target = C14NWriterTarget(
                        f.write, with_comments=with_comments, strip_text=strip_text)
                    _tree_to_target(self, target)
            # C14N output is complete, none of the options below apply
            return

        if not with_comments:
            raise ValueError("Can only discard comments in C14N serialisation")

        # suppress decl. in default case (purely for ElementTree compatibility)
        if xml_declaration is not None:
            # explicit user choice wins, only the encoding gets normalised
            write_declaration = xml_declaration
            if encoding is None:
                encoding = 'ASCII'
            else:
                encoding = encoding.upper()
        elif encoding is None:
            encoding = 'ASCII'
            write_declaration = 0
        else:
            # no explicit choice: declare everything except ASCII and UTF-8
            encoding = encoding.upper()
            write_declaration = encoding not in (
                'US-ASCII', 'ASCII', 'UTF8', 'UTF-8')

        # an explicit standalone flag (True or False) forces a declaration;
        # -1 is passed on when the flag was not given
        if standalone is None:
            is_standalone = -1
        elif standalone:
            write_declaration = 1
            is_standalone = 1
        else:
            write_declaration = 1
            is_standalone = 0

        # deprecated alias, only honoured if 'doctype' was not passed
        if docstring is not None and doctype is None:
            import warnings
            warnings.warn(
                "The 'docstring' option is deprecated. Use 'doctype' instead.",
                DeprecationWarning)
            doctype = docstring

        _tofilelike(file, self._context_node, encoding, doctype, method,
                    write_declaration, 1, pretty_print, with_tail,
                    is_standalone, compression)
  1873. def getpath(self, _Element element not None):
  1874. """getpath(self, element)
  1875. Returns a structural, absolute XPath expression to find the element.
  1876. For namespaced elements, the expression uses prefixes from the
  1877. document, which therefore need to be provided in order to make any
  1878. use of the expression in XPath.
  1879. Also see the method getelementpath(self, element), which returns a
  1880. self-contained ElementPath expression.
  1881. """
  1882. cdef _Document doc
  1883. cdef _Element root
  1884. cdef xmlDoc* c_doc
  1885. _assertValidNode(element)
  1886. if self._context_node is not None:
  1887. root = self._context_node
  1888. doc = root._doc
  1889. elif self._doc is not None:
  1890. doc = self._doc
  1891. root = doc.getroot()
  1892. else:
  1893. raise ValueError, "Element is not in this tree."
  1894. _assertValidDoc(doc)
  1895. _assertValidNode(root)
  1896. if element._doc is not doc:
  1897. raise ValueError, "Element is not in this tree."
  1898. c_doc = _fakeRootDoc(doc._c_doc, root._c_node)
  1899. c_path = tree.xmlGetNodePath(element._c_node)
  1900. _destroyFakeDoc(doc._c_doc, c_doc)
  1901. if c_path is NULL:
  1902. raise MemoryError()
  1903. path = funicode(c_path)
  1904. tree.xmlFree(c_path)
  1905. return path
    def getelementpath(self, _Element element not None):
        """getelementpath(self, element)

        Returns a structural, absolute ElementPath expression to find the
        element.  This path can be used in the .find() method to look up
        the element, provided that the elements along the path and their
        list of immediate children were not modified in between.

        ElementPath has the advantage over an XPath expression (as returned
        by the .getpath() method) that it does not require additional prefix
        declarations.  It is always self-contained.

        Returns ``'.'`` if the element is the root of this tree.  Raises
        ValueError if the input is not an element node or if it cannot be
        reached from the root of this tree.
        """
        cdef _Element root
        cdef Py_ssize_t count
        _assertValidNode(element)
        if element._c_node.type != tree.XML_ELEMENT_NODE:
            raise ValueError, "input is not an Element"
        if self._context_node is not None:
            root = self._context_node
        elif self._doc is not None:
            root = self._doc.getroot()
        else:
            raise ValueError, "Element is not in this tree"
        _assertValidNode(root)
        if element._doc is not root._doc:
            raise ValueError, "Element is not in this tree"

        # Walk upwards from the element to the root and collect one path
        # step per level; the steps are reversed at the end.
        path = []
        c_element = element._c_node
        while c_element is not root._c_node:
            c_name = c_element.name
            c_href = _getNs(c_element)
            tag = _namespacedNameFromNsName(c_href, c_name)
            if c_href is NULL:
                c_href = <const_xmlChar*>b''  # no namespace (NULL is wildcard)
            # use tag[N] if there are preceding siblings with the same tag
            count = 0
            c_node = c_element.prev
            while c_node is not NULL:
                if c_node.type == tree.XML_ELEMENT_NODE:
                    if _tagMatches(c_node, c_href, c_name):
                        count += 1
                c_node = c_node.prev
            if count:
                # positions are 1-based, 'count' siblings come before us
                tag = f'{tag}[{count+1}]'
            else:
                # use tag[1] if there are following siblings with the same tag
                c_node = c_element.next
                while c_node is not NULL:
                    if c_node.type == tree.XML_ELEMENT_NODE:
                        if _tagMatches(c_node, c_href, c_name):
                            tag += '[1]'
                            break
                    c_node = c_node.next

            path.append(tag)
            c_element = c_element.parent
            # reaching a non-element parent means the walk left the tree
            # without ever meeting the root node
            if c_element is NULL or c_element.type != tree.XML_ELEMENT_NODE:
                raise ValueError, "Element is not in this tree."
        if not path:
            # the element is the root itself
            return '.'
        path.reverse()
        return '/'.join(path)
    def getiterator(self, tag=None, *tags):
        """getiterator(self, tag=None, *tags)

        Returns a sequence or iterator of all elements in document order
        (depth first pre-order), starting with the root element.

        Can be restricted to find only elements with specific tags,
        see `_Element.iter`.

        :deprecated: Note that this method is deprecated as of
          ElementTree 1.3 and lxml 2.0.  It returns an iterator in
          lxml, which diverges from the original ElementTree
          behaviour.  If you want an efficient iterator, use the
          ``tree.iter()`` method instead.  You should only use this
          method in new code if you require backwards compatibility
          with older versions of lxml or ElementTree.
        """
        root = self.getroot()
        if root is None:
            # no root element, nothing to iterate over
            return ITER_EMPTY
        if tag is not None:
            # merge the first positional tag into the remaining ones
            tags += (tag,)
        return root.getiterator(*tags)
  1985. def iter(self, tag=None, *tags):
  1986. """iter(self, tag=None, *tags)
  1987. Creates an iterator for the root element. The iterator loops over
  1988. all elements in this tree, in document order. Note that siblings
  1989. of the root element (comments or processing instructions) are not
  1990. returned by the iterator.
  1991. Can be restricted to find only elements with specific tags,
  1992. see `_Element.iter`.
  1993. """
  1994. root = self.getroot()
  1995. if root is None:
  1996. return ITER_EMPTY
  1997. if tag is not None:
  1998. tags += (tag,)
  1999. return root.iter(*tags)
  2000. def find(self, path, namespaces=None):
  2001. """find(self, path, namespaces=None)
  2002. Finds the first toplevel element with given tag. Same as
  2003. ``tree.getroot().find(path)``.
  2004. The optional ``namespaces`` argument accepts a
  2005. prefix-to-namespace mapping that allows the usage of XPath
  2006. prefixes in the path expression.
  2007. """
  2008. self._assertHasRoot()
  2009. root = self.getroot()
  2010. if _isString(path):
  2011. if path[:1] == "/":
  2012. path = "." + path
  2013. from warnings import warn
  2014. warn(
  2015. "This search incorrectly ignores the root element, and will be "
  2016. "fixed in a future version. If you rely on the current "
  2017. f"behaviour, change it to {path!r}",
  2018. FutureWarning, stacklevel=1
  2019. )
  2020. return root.find(path, namespaces)
  2021. def findtext(self, path, default=None, namespaces=None):
  2022. """findtext(self, path, default=None, namespaces=None)
  2023. Finds the text for the first element matching the ElementPath
  2024. expression. Same as getroot().findtext(path)
  2025. The optional ``namespaces`` argument accepts a
  2026. prefix-to-namespace mapping that allows the usage of XPath
  2027. prefixes in the path expression.
  2028. """
  2029. self._assertHasRoot()
  2030. root = self.getroot()
  2031. if _isString(path):
  2032. if path[:1] == "/":
  2033. path = "." + path
  2034. from warnings import warn
  2035. warn(
  2036. "This search incorrectly ignores the root element, and will be "
  2037. "fixed in a future version. If you rely on the current "
  2038. f"behaviour, change it to {path!r}",
  2039. FutureWarning, stacklevel=1
  2040. )
  2041. return root.findtext(path, default, namespaces)
  2042. def findall(self, path, namespaces=None):
  2043. """findall(self, path, namespaces=None)
  2044. Finds all elements matching the ElementPath expression. Same as
  2045. getroot().findall(path).
  2046. The optional ``namespaces`` argument accepts a
  2047. prefix-to-namespace mapping that allows the usage of XPath
  2048. prefixes in the path expression.
  2049. """
  2050. self._assertHasRoot()
  2051. root = self.getroot()
  2052. if _isString(path):
  2053. if path[:1] == "/":
  2054. path = "." + path
  2055. from warnings import warn
  2056. warn(
  2057. "This search incorrectly ignores the root element, and will be "
  2058. "fixed in a future version. If you rely on the current "
  2059. f"behaviour, change it to {path!r}",
  2060. FutureWarning, stacklevel=1
  2061. )
  2062. return root.findall(path, namespaces)
  2063. def iterfind(self, path, namespaces=None):
  2064. """iterfind(self, path, namespaces=None)
  2065. Iterates over all elements matching the ElementPath expression.
  2066. Same as getroot().iterfind(path).
  2067. The optional ``namespaces`` argument accepts a
  2068. prefix-to-namespace mapping that allows the usage of XPath
  2069. prefixes in the path expression.
  2070. """
  2071. self._assertHasRoot()
  2072. root = self.getroot()
  2073. if _isString(path):
  2074. if path[:1] == "/":
  2075. path = "." + path
  2076. from warnings import warn
  2077. warn(
  2078. "This search incorrectly ignores the root element, and will be "
  2079. "fixed in a future version. If you rely on the current "
  2080. f"behaviour, change it to {path!r}",
  2081. FutureWarning, stacklevel=1
  2082. )
  2083. return root.iterfind(path, namespaces)
  2084. def xpath(self, _path, *, namespaces=None, extensions=None,
  2085. smart_strings=True, **_variables):
  2086. """xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
  2087. XPath evaluate in context of document.
  2088. ``namespaces`` is an optional dictionary with prefix to namespace URI
  2089. mappings, used by XPath. ``extensions`` defines additional extension
  2090. functions.
  2091. Returns a list (nodeset), or bool, float or string.
  2092. In case of a list result, return Element for element nodes,
  2093. string for text and attribute values.
  2094. Note: if you are going to apply multiple XPath expressions
  2095. against the same document, it is more efficient to use
  2096. XPathEvaluator directly.
  2097. """
  2098. self._assertHasRoot()
  2099. evaluator = XPathDocumentEvaluator(self, namespaces=namespaces,
  2100. extensions=extensions,
  2101. smart_strings=smart_strings)
  2102. return evaluator(_path, **_variables)
  2103. def xslt(self, _xslt, extensions=None, access_control=None, **_kw):
  2104. """xslt(self, _xslt, extensions=None, access_control=None, **_kw)
  2105. Transform this document using other document.
  2106. xslt is a tree that should be XSLT
  2107. keyword parameters are XSLT transformation parameters.
  2108. Returns the transformed tree.
  2109. Note: if you are going to apply the same XSLT stylesheet against
  2110. multiple documents, it is more efficient to use the XSLT
  2111. class directly.
  2112. """
  2113. self._assertHasRoot()
  2114. style = XSLT(_xslt, extensions=extensions,
  2115. access_control=access_control)
  2116. return style(self, **_kw)
  2117. def relaxng(self, relaxng):
  2118. """relaxng(self, relaxng)
  2119. Validate this document using other document.
  2120. The relaxng argument is a tree that should contain a Relax NG schema.
  2121. Returns True or False, depending on whether validation
  2122. succeeded.
  2123. Note: if you are going to apply the same Relax NG schema against
  2124. multiple documents, it is more efficient to use the RelaxNG
  2125. class directly.
  2126. """
  2127. self._assertHasRoot()
  2128. schema = RelaxNG(relaxng)
  2129. return schema.validate(self)
  2130. def xmlschema(self, xmlschema):
  2131. """xmlschema(self, xmlschema)
  2132. Validate this document using other document.
  2133. The xmlschema argument is a tree that should contain an XML Schema.
  2134. Returns True or False, depending on whether validation
  2135. succeeded.
  2136. Note: If you are going to apply the same XML Schema against
  2137. multiple documents, it is more efficient to use the XMLSchema
  2138. class directly.
  2139. """
  2140. self._assertHasRoot()
  2141. schema = XMLSchema(xmlschema)
  2142. return schema.validate(self)
  2143. def xinclude(self):
  2144. """xinclude(self)
  2145. Process the XInclude nodes in this document and include the
  2146. referenced XML fragments.
  2147. There is support for loading files through the file system, HTTP and
  2148. FTP.
  2149. Note that XInclude does not support custom resolvers in Python space
  2150. due to restrictions of libxml2 <= 2.6.29.
  2151. """
  2152. self._assertHasRoot()
  2153. XInclude()(self._context_node)
    def write_c14n(self, file, *, bint exclusive=False, bint with_comments=True,
                   compression=0, inclusive_ns_prefixes=None):
        """write_c14n(self, file, exclusive=False, with_comments=True,
                       compression=0, inclusive_ns_prefixes=None)

        C14N write of document. Always writes UTF-8.

        The ``compression`` option enables GZip compression level 1-9.

        The ``inclusive_ns_prefixes`` should be a list of namespace strings
        (i.e. ['xs', 'xsi']) that will be promoted to the top-level element
        during exclusive C14N serialisation.  This parameter is ignored if
        exclusive mode=False.

        If exclusive=True and no list is provided, a namespace will only be
        rendered if it is used by the immediate parent or one of its attributes
        and its prefix and values have not already been rendered by an ancestor
        of the namespace node's parent element.

        NOTE: This method is deprecated as of lxml 4.4 and will be removed in a
        future release.  Use ``.write(f, method="c14n")`` instead.
        """
        self._assertHasRoot()
        _assertValidNode(self._context_node)
        # None and negative values both mean "no compression"
        if compression is None or compression < 0:
            compression = 0

        _tofilelikeC14N(file, self._context_node, exclusive, with_comments,
                        compression, inclusive_ns_prefixes)
# Create a plain _ElementTree for the given document and/or context node.
cdef _ElementTree _elementTreeFactory(_Document doc, _Element context_node):
    return _newElementTree(doc, context_node, _ElementTree)
  2179. cdef _ElementTree _newElementTree(_Document doc, _Element context_node,
  2180. object baseclass):
  2181. cdef _ElementTree result
  2182. result = baseclass()
  2183. if context_node is None and doc is not None:
  2184. context_node = doc.getroot()
  2185. if context_node is None:
  2186. _assertValidDoc(doc)
  2187. result._doc = doc
  2188. else:
  2189. _assertValidNode(context_node)
  2190. result._context_node = context_node
  2191. return result
@cython.final
@cython.freelist(16)
cdef class _Attrib:
    """A dict-like proxy for the ``Element.attrib`` property.

    All operations work directly on the attributes of the wrapped
    element; the proxy itself stores nothing but the element reference.
    """
    cdef _Element _element

    def __cinit__(self, _Element element not None):
        # Validate the element once on creation; every method validates
        # it again before it touches the C node.
        _assertValidNode(element)
        self._element = element

    # MANIPULATORS

    def __setitem__(self, key, value):
        # Set (or replace) a single attribute on the element.
        _assertValidNode(self._element)
        _setAttributeValue(self._element, key, value)

    def __delitem__(self, key):
        # Remove a single attribute from the element.
        _assertValidNode(self._element)
        _delAttribute(self._element, key)

    def update(self, sequence_or_dict):
        """Set attributes from a dict or another ``_Attrib`` (through its
        ``items()``), or from any iterable of ``(key, value)`` pairs.
        """
        _assertValidNode(self._element)
        if isinstance(sequence_or_dict, (dict, _Attrib)):
            sequence_or_dict = sequence_or_dict.items()
        for key, value in sequence_or_dict:
            _setAttributeValue(self._element, key, value)

    def pop(self, key, *default):
        """Remove the attribute ``key`` and return its value.

        If the attribute is not set, return the optional default value,
        or raise KeyError if no default was passed.  Raises TypeError if
        more than one default value is passed.
        """
        if len(default) > 1:
            raise TypeError, f"pop expected at most 2 arguments, got {len(default)+1}"
        _assertValidNode(self._element)
        # a None result is taken to mean "attribute not set"
        result = _getAttributeValue(self._element, key, None)
        if result is None:
            if not default:
                raise KeyError, key
            result = default[0]
        else:
            _delAttribute(self._element, key)
        return result

    def clear(self):
        """Remove all attributes from the element."""
        _assertValidNode(self._element)
        c_attrs = self._element._c_node.properties
        if c_attrs:
            # unlink the property list from the node before freeing it
            self._element._c_node.properties = NULL
            tree.xmlFreePropList(c_attrs)

    # ACCESSORS

    def __repr__(self):
        # Rendered like a plain dict built from the collected attributes.
        # NOTE(review): mode 3 presumably selects (key, value) items, as
        # for _AttribIterator._keysvalues - confirm in _collectAttributes.
        _assertValidNode(self._element)
        return repr(dict( _collectAttributes(self._element._c_node, 3) ))

    def __copy__(self):
        # Copying returns a plain dict, not another proxy.
        _assertValidNode(self._element)
        return dict(_collectAttributes(self._element._c_node, 3))

    def __deepcopy__(self, memo):
        # Same result as __copy__: a plain dict, 'memo' is not used.
        _assertValidNode(self._element)
        return dict(_collectAttributes(self._element._c_node, 3))

    def __getitem__(self, key):
        # Raises KeyError if the lookup yields None.
        _assertValidNode(self._element)
        result = _getAttributeValue(self._element, key, None)
        if result is None:
            raise KeyError, key
        return result

    def __bool__(self):
        # True if the property list has at least one attribute node.
        _assertValidNode(self._element)
        cdef xmlAttr* c_attr = self._element._c_node.properties
        while c_attr is not NULL:
            if c_attr.type == tree.XML_ATTRIBUTE_NODE:
                return 1
            c_attr = c_attr.next
        return 0

    def __len__(self):
        # Counts only the attribute nodes of the property list.
        _assertValidNode(self._element)
        cdef xmlAttr* c_attr = self._element._c_node.properties
        cdef Py_ssize_t c = 0
        while c_attr is not NULL:
            if c_attr.type == tree.XML_ATTRIBUTE_NODE:
                c += 1
            c_attr = c_attr.next
        return c

    def get(self, key, default=None):
        """Return the value of attribute ``key``, passing ``default`` on
        to the lookup as fallback value.
        """
        _assertValidNode(self._element)
        return _getAttributeValue(self._element, key, default)

    def keys(self):
        """Return what ``_collectAttributes`` collects in mode 1
        (presumably the attribute names - confirm in that helper).
        """
        _assertValidNode(self._element)
        return _collectAttributes(self._element._c_node, 1)

    def __iter__(self):
        # Iterates over the same collection that keys() returns.
        _assertValidNode(self._element)
        return iter(_collectAttributes(self._element._c_node, 1))

    def iterkeys(self):
        """Return an iterator over the collection that ``keys()`` returns."""
        _assertValidNode(self._element)
        return iter(_collectAttributes(self._element._c_node, 1))

    def values(self):
        """Return what ``_collectAttributes`` collects in mode 2
        (presumably the attribute values - confirm in that helper).
        """
        _assertValidNode(self._element)
        return _collectAttributes(self._element._c_node, 2)

    def itervalues(self):
        """Return an iterator over the collection that ``values()`` returns."""
        _assertValidNode(self._element)
        return iter(_collectAttributes(self._element._c_node, 2))

    def items(self):
        """Return what ``_collectAttributes`` collects in mode 3
        (presumably ``(name, value)`` pairs - confirm in that helper).
        """
        _assertValidNode(self._element)
        return _collectAttributes(self._element._c_node, 3)

    def iteritems(self):
        """Return an iterator over the collection that ``items()`` returns."""
        _assertValidNode(self._element)
        return iter(_collectAttributes(self._element._c_node, 3))

    def has_key(self, key):
        """Same as ``key in self``."""
        _assertValidNode(self._element)
        return key in self

    def __contains__(self, key):
        # Splits the key into namespace and local name and asks libxml2
        # whether the node has such a property.
        _assertValidNode(self._element)
        cdef xmlNode* c_node
        ns, tag = _getNsTag(key)
        c_node = self._element._c_node
        c_href = <const_xmlChar*>NULL if ns is None else _xcstr(ns)
        return 1 if tree.xmlHasNsProp(c_node, _xcstr(tag), c_href) else 0

    def __richcmp__(self, other, int op):
        # Compare as plain dicts.  If either side cannot be turned into
        # a dict, leave the comparison to the other operand.
        try:
            one = dict(self.items())
            if not isinstance(other, dict):
                other = dict(other)
        except (TypeError, ValueError):
            return NotImplemented
        return python.PyObject_RichCompare(one, other, op)
# Register _Attrib as a virtual subclass of the MutableMapping ABC.
MutableMapping.register(_Attrib)
  2308. @cython.final
  2309. @cython.internal
  2310. cdef class _AttribIterator:
  2311. """Attribute iterator - for internal use only!
  2312. """
  2313. # XML attributes must not be removed while running!
  2314. cdef _Element _node
  2315. cdef xmlAttr* _c_attr
  2316. cdef int _keysvalues # 1 - keys, 2 - values, 3 - items (key, value)
  2317. def __iter__(self):
  2318. return self
  2319. def __next__(self):
  2320. cdef xmlAttr* c_attr
  2321. if self._node is None:
  2322. raise StopIteration
  2323. c_attr = self._c_attr
  2324. while c_attr is not NULL and c_attr.type != tree.XML_ATTRIBUTE_NODE:
  2325. c_attr = c_attr.next
  2326. if c_attr is NULL:
  2327. self._node = None
  2328. raise StopIteration
  2329. self._c_attr = c_attr.next
  2330. if self._keysvalues == 1:
  2331. return _namespacedName(<xmlNode*>c_attr)
  2332. elif self._keysvalues == 2:
  2333. return _attributeValue(self._node._c_node, c_attr)
  2334. else:
  2335. return (_namespacedName(<xmlNode*>c_attr),
  2336. _attributeValue(self._node._c_node, c_attr))
  2337. cdef object _attributeIteratorFactory(_Element element, int keysvalues):
  2338. cdef _AttribIterator attribs
  2339. if element._c_node.properties is NULL:
  2340. return ITER_EMPTY
  2341. attribs = _AttribIterator()
  2342. attribs._node = element
  2343. attribs._c_attr = element._c_node.properties
  2344. attribs._keysvalues = keysvalues
  2345. return attribs
cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher,
                                       type LxmlElementTagMatcherType ]:
    """
    Dead but public. :)
    """
    # Holds the (namespace, name) pair returned by _getNsTag().  The C
    # pointers below are taken from these objects, so the reference
    # presumably serves to keep their buffers alive - confirm against
    # _cstr().
    cdef object _pystrings
    # libxml2 node type to match, 0 if no tag was given
    cdef int _node_type
    # namespace URI to match, NULL if the tag had no namespace part
    cdef char* _href
    # local name to match, NULL if unset or given as '*'
    cdef char* _name

    cdef _initTagMatch(self, tag):
        # Configure the matcher from 'tag': None, one of the factories
        # Comment / ProcessingInstruction / Entity / Element, or a tag
        # name that gets split by _getNsTag().
        self._href = NULL
        self._name = NULL
        if tag is None:
            self._node_type = 0
        elif tag is Comment:
            self._node_type = tree.XML_COMMENT_NODE
        elif tag is ProcessingInstruction:
            self._node_type = tree.XML_PI_NODE
        elif tag is Entity:
            self._node_type = tree.XML_ENTITY_REF_NODE
        elif tag is Element:
            self._node_type = tree.XML_ELEMENT_NODE
        else:
            # a tag name: match element nodes by namespace and name
            self._node_type = tree.XML_ELEMENT_NODE
            self._pystrings = _getNsTag(tag)
            if self._pystrings[0] is not None:
                self._href = _cstr(self._pystrings[0])
            self._name = _cstr(self._pystrings[1])
            # a name of exactly '*' is stored as NULL
            if self._name[0] == c'*' and self._name[1] == c'\0':
                self._name = NULL
  2376. cdef public class _ElementIterator(_ElementTagMatcher) [
  2377. object LxmlElementIterator, type LxmlElementIteratorType ]:
  2378. """
  2379. Dead but public. :)
  2380. """
  2381. # we keep Python references here to control GC
  2382. cdef _Element _node
  2383. cdef _node_to_node_function _next_element
  2384. def __iter__(self):
  2385. return self
  2386. cdef void _storeNext(self, _Element node):
  2387. cdef xmlNode* c_node
  2388. c_node = self._next_element(node._c_node)
  2389. while c_node is not NULL and \
  2390. self._node_type != 0 and \
  2391. (<tree.xmlElementType>self._node_type != c_node.type or
  2392. not _tagMatches(c_node, <const_xmlChar*>self._href, <const_xmlChar*>self._name)):
  2393. c_node = self._next_element(c_node)
  2394. if c_node is NULL:
  2395. self._node = None
  2396. else:
  2397. # Python ref:
  2398. self._node = _elementFactory(node._doc, c_node)
  2399. def __next__(self):
  2400. cdef xmlNode* c_node
  2401. cdef _Element current_node
  2402. if self._node is None:
  2403. raise StopIteration
  2404. # Python ref:
  2405. current_node = self._node
  2406. self._storeNext(current_node)
  2407. return current_node
@cython.final
@cython.internal
cdef class _MultiTagMatcher:
    """
    Match an xmlNode against a list of tags.

    The tags are first collected as Python ``(href, name)`` tuples and node
    type bits (see ``_storeTags()``).  Before matching, ``cacheTags()`` must
    be called to map the collected names into the C array ``_cached_tags``
    for the document at hand.
    """
    # (href, name) tuples collected by _storeTags()
    cdef list _py_tags
    # C array filled by cacheTags(); NULL while nothing is allocated
    cdef qname* _cached_tags
    # number of entries in _cached_tags that are currently valid
    cdef size_t _tag_count
    # size of the document dict at the time of the last cacheTags() call
    cdef size_t _cached_size
    # document that _cached_tags was last built for
    cdef _Document _cached_doc
    # bit mask of accepted node types, using bit (1 << node_type)
    cdef int _node_types

    def __cinit__(self, tags):
        self._py_tags = []
        self.initTagMatch(tags)

    def __dealloc__(self):
        self._clear()

    cdef bint rejectsAll(self) noexcept:
        # True if there is neither a cached tag nor an accepted node type.
        return not self._tag_count and not self._node_types

    cdef bint rejectsAllAttributes(self) noexcept:
        # Attribute matching only looks at the cached tags.
        return not self._tag_count

    cdef bint matchesType(self, int node_type) noexcept:
        # Elements may match as soon as any tag is cached; all other
        # node types are looked up in the type bit mask.
        if node_type == tree.XML_ELEMENT_NODE and self._tag_count:
            return True
        return self._node_types & (1 << node_type)

    cdef void _clear(self) noexcept:
        # Release the references held by the cached tags and free the array.
        cdef size_t i, count
        count = self._tag_count
        # reset the count first so that no stale entry is used for matching
        self._tag_count = 0
        if self._cached_tags:
            for i in range(count):
                cpython.ref.Py_XDECREF(self._cached_tags[i].href)
            python.lxml_free(self._cached_tags)
            self._cached_tags = NULL

    cdef initTagMatch(self, tags):
        # (Re-)initialise the matcher from a tag selection, dropping all
        # previously collected and cached state.
        self._cached_doc = None
        del self._py_tags[:]
        self._clear()
        if tags is None or tags == ():
            # no selection in tags argument => match anything
            self._node_types = (
                1 << tree.XML_COMMENT_NODE |
                1 << tree.XML_PI_NODE |
                1 << tree.XML_ENTITY_REF_NODE |
                1 << tree.XML_ELEMENT_NODE)
        else:
            self._node_types = 0
            self._storeTags(tags, set())

    cdef _storeTags(self, tag, set seen):
        # Recursively collect 'tag' into the node type mask or into
        # _py_tags.  'seen' holds the tag strings handled so far so that
        # duplicates are stored only once.
        if tag is Comment:
            self._node_types |= 1 << tree.XML_COMMENT_NODE
        elif tag is ProcessingInstruction:
            self._node_types |= 1 << tree.XML_PI_NODE
        elif tag is Entity:
            self._node_types |= 1 << tree.XML_ENTITY_REF_NODE
        elif tag is Element:
            self._node_types |= 1 << tree.XML_ELEMENT_NODE
        elif python._isString(tag):
            if tag in seen:
                return
            seen.add(tag)
            if tag in ('*', '{*}*'):
                # full wildcard => accept all elements by type
                self._node_types |= 1 << tree.XML_ELEMENT_NODE
            else:
                href, name = _getNsTag(tag)
                if name == b'*':
                    # wildcard name is stored as None
                    name = None
                if href is None:
                    href = b''  # no namespace
                elif href == b'*':
                    href = None  # wildcard: any namespace, including none
                self._py_tags.append((href, name))
        elif isinstance(tag, QName):
            self._storeTags(tag.text, seen)
        else:
            # support a sequence of tags
            for item in tag:
                self._storeTags(item, seen)

    cdef inline int cacheTags(self, _Document doc, bint force_into_dict=False) except -1:
        """
        Look up the tag names in the doc dict to enable string pointer comparisons.

        Returns 0 on success.  Raises MemoryError if the tag array cannot
        be allocated.
        """
        cdef size_t dict_size = tree.xmlDictSize(doc._c_doc.dict)
        if doc is self._cached_doc and dict_size == self._cached_size:
            # doc and dict didn't change => names already cached
            return 0
        # invalidate the current entries while the cache is being rebuilt
        self._tag_count = 0
        if not self._py_tags:
            # no named tags => nothing to map, only remember the cache state
            self._cached_doc = doc
            self._cached_size = dict_size
            return 0
        if not self._cached_tags:
            # allocate one entry per collected tag
            self._cached_tags = <qname*>python.lxml_malloc(len(self._py_tags), sizeof(qname))
            if not self._cached_tags:
                self._cached_doc = None
                raise MemoryError()
        self._tag_count = <size_t>_mapTagsToQnameMatchArray(
            doc._c_doc, self._py_tags, self._cached_tags, force_into_dict)
        self._cached_doc = doc
        self._cached_size = dict_size
        return 0

    cdef inline bint matches(self, xmlNode* c_node) noexcept:
        # A node matches if its type is accepted as a whole, or if it is
        # an element that matches one of the cached tags.
        cdef qname* c_qname
        if self._node_types & (1 << c_node.type):
            return True
        elif c_node.type == tree.XML_ELEMENT_NODE:
            for c_qname in self._cached_tags[:self._tag_count]:
                if _tagMatchesExactly(c_node, c_qname):
                    return True
        return False

    cdef inline bint matchesNsTag(self, const_xmlChar* c_href,
                                  const_xmlChar* c_name) noexcept:
        # Same as matches(), but for a namespace/name pair instead of a node.
        cdef qname* c_qname
        if self._node_types & (1 << tree.XML_ELEMENT_NODE):
            return True
        for c_qname in self._cached_tags[:self._tag_count]:
            if _nsTagMatchesExactly(c_href, c_name, c_qname):
                return True
        return False

    cdef inline bint matchesAttribute(self, xmlAttr* c_attr) noexcept:
        """Attribute matches differ from Element matches in that they do
        not care about node types.
        """
        cdef qname* c_qname
        for c_qname in self._cached_tags[:self._tag_count]:
            if _tagMatchesExactly(<xmlNode*>c_attr, c_qname):
                return True
        return False
  2536. cdef class _ElementMatchIterator:
  2537. cdef _Element _node
  2538. cdef _node_to_node_function _next_element
  2539. cdef _MultiTagMatcher _matcher
  2540. @cython.final
  2541. cdef _initTagMatcher(self, tags):
  2542. self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tags)
  2543. def __iter__(self):
  2544. return self
  2545. @cython.final
  2546. cdef int _storeNext(self, _Element node) except -1:
  2547. self._matcher.cacheTags(node._doc)
  2548. c_node = self._next_element(node._c_node)
  2549. while c_node is not NULL and not self._matcher.matches(c_node):
  2550. c_node = self._next_element(c_node)
  2551. # store Python ref to next node to make sure it's kept alive
  2552. self._node = _elementFactory(node._doc, c_node) if c_node is not NULL else None
  2553. return 0
  2554. def __next__(self):
  2555. cdef _Element current_node = self._node
  2556. if current_node is None:
  2557. raise StopIteration
  2558. self._storeNext(current_node)
  2559. return current_node
  2560. cdef class ElementChildIterator(_ElementMatchIterator):
  2561. """ElementChildIterator(self, node, tag=None, reversed=False)
  2562. Iterates over the children of an element.
  2563. """
  2564. def __cinit__(self, _Element node not None, tag=None, *, bint reversed=False):
  2565. cdef xmlNode* c_node
  2566. _assertValidNode(node)
  2567. self._initTagMatcher(tag)
  2568. if reversed:
  2569. c_node = _findChildBackwards(node._c_node, 0)
  2570. self._next_element = _previousElement
  2571. else:
  2572. c_node = _findChildForwards(node._c_node, 0)
  2573. self._next_element = _nextElement
  2574. self._matcher.cacheTags(node._doc)
  2575. while c_node is not NULL and not self._matcher.matches(c_node):
  2576. c_node = self._next_element(c_node)
  2577. # store Python ref to next node to make sure it's kept alive
  2578. self._node = _elementFactory(node._doc, c_node) if c_node is not NULL else None
  2579. cdef class SiblingsIterator(_ElementMatchIterator):
  2580. """SiblingsIterator(self, node, tag=None, preceding=False)
  2581. Iterates over the siblings of an element.
  2582. You can pass the boolean keyword ``preceding`` to specify the direction.
  2583. """
  2584. def __cinit__(self, _Element node not None, tag=None, *, bint preceding=False):
  2585. _assertValidNode(node)
  2586. self._initTagMatcher(tag)
  2587. if preceding:
  2588. self._next_element = _previousElement
  2589. else:
  2590. self._next_element = _nextElement
  2591. self._storeNext(node)
  2592. cdef class AncestorsIterator(_ElementMatchIterator):
  2593. """AncestorsIterator(self, node, tag=None)
  2594. Iterates over the ancestors of an element (from parent to parent).
  2595. """
  2596. def __cinit__(self, _Element node not None, tag=None):
  2597. _assertValidNode(node)
  2598. self._initTagMatcher(tag)
  2599. self._next_element = _parentElement
  2600. self._storeNext(node)
cdef class ElementDepthFirstIterator:
    """ElementDepthFirstIterator(self, node, tag=None, inclusive=True)
    Iterates over an element and its sub-elements in document order (depth
    first pre-order).

    Note that this also includes comments, entities and processing
    instructions.  To filter them out, check if the ``tag`` property
    of the returned element is a string (i.e. not None and not a
    factory function), or pass the ``Element`` factory for the ``tag``
    argument to receive only Elements.

    If the optional ``tag`` argument is not None, the iterator returns only
    the elements that match the respective name and namespace.

    The optional boolean argument 'inclusive' defaults to True and can be set
    to False to exclude the start element itself.

    Note that the behaviour of this iterator is completely undefined if the
    tree it traverses is modified during iteration.
    """
    # we keep Python references here to control GC
    # keep the next Element after the one we return, and the (s)top node
    cdef _Element _next_node
    cdef _Element _top_node
    cdef _MultiTagMatcher _matcher

    def __cinit__(self, _Element node not None, tag=None, *, bint inclusive=True):
        _assertValidNode(node)
        self._top_node = node
        self._next_node = node
        self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)
        self._matcher.cacheTags(node._doc)
        # Skip the start node if it is excluded or does not match.
        if not inclusive or not self._matcher.matches(node._c_node):
            # find start node (this cannot raise StopIteration, self._next_node != None)
            next(self)

    def __iter__(self):
        return self

    def __next__(self):
        # Returns the node stored by the previous step and looks up its
        # successor right away, so that a Python reference to it is held.
        cdef xmlNode* c_node
        cdef _Element current_node = self._next_node
        if current_node is None:
            raise StopIteration
        c_node = current_node._c_node
        self._matcher.cacheTags(current_node._doc)
        if not self._matcher._tag_count:
            # no tag name was found in the dict => not in document either
            # try to match by node type
            c_node = self._nextNodeAnyTag(c_node)
        else:
            c_node = self._nextNodeMatchTag(c_node)
        if c_node is NULL:
            self._next_node = None
        else:
            self._next_node = _elementFactory(current_node._doc, c_node)
        return current_node

    @cython.final
    cdef xmlNode* _nextNodeAnyTag(self, xmlNode* c_node) noexcept:
        # Find the next node after 'c_node' whose type bit is set in the
        # matcher's node type mask.  Returns NULL if there is none.
        cdef int node_types = self._matcher._node_types
        if not node_types:
            # no node type accepted at all => nothing can match
            return NULL
        # The traversal itself is done by the BEGIN/END macro pair, which
        # is given the top node and the current node (defined elsewhere).
        tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
        if node_types & (1 << c_node.type):
            return c_node
        tree.END_FOR_EACH_ELEMENT_FROM(c_node)
        return NULL

    @cython.final
    cdef xmlNode* _nextNodeMatchTag(self, xmlNode* c_node) noexcept:
        # Find the next node after 'c_node' that the matcher accepts.
        # Returns NULL if there is none.
        tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
        if self._matcher.matches(c_node):
            return c_node
        tree.END_FOR_EACH_ELEMENT_FROM(c_node)
        return NULL
  2668. cdef class ElementTextIterator:
  2669. """ElementTextIterator(self, element, tag=None, with_tail=True)
  2670. Iterates over the text content of a subtree.
  2671. You can pass the ``tag`` keyword argument to restrict text content to a
  2672. specific tag name.
  2673. You can set the ``with_tail`` keyword argument to ``False`` to skip over
  2674. tail text (e.g. if you know that it's only whitespace from pretty-printing).
  2675. """
  2676. cdef object _events
  2677. cdef _Element _start_element
  2678. def __cinit__(self, _Element element not None, tag=None, *, bint with_tail=True):
  2679. _assertValidNode(element)
  2680. if with_tail:
  2681. events = ("start", "comment", "pi", "end")
  2682. else:
  2683. events = ("start",)
  2684. self._start_element = element
  2685. self._events = iterwalk(element, events=events, tag=tag)
  2686. def __iter__(self):
  2687. return self
  2688. def __next__(self):
  2689. cdef _Element element
  2690. result = None
  2691. while result is None:
  2692. event, element = next(self._events) # raises StopIteration
  2693. if event == "start":
  2694. result = element.text
  2695. elif element is not self._start_element:
  2696. result = element.tail
  2697. return result
  2698. cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf) except NULL:
  2699. cdef xmlNode* c_node
  2700. c_node = tree.xmlNewDocNode(c_doc, NULL, _xcstr(name_utf), NULL)
  2701. return c_node
  2702. cdef xmlNode* _createComment(xmlDoc* c_doc, const_xmlChar* text) noexcept:
  2703. cdef xmlNode* c_node
  2704. c_node = tree.xmlNewDocComment(c_doc, text)
  2705. return c_node
  2706. cdef xmlNode* _createPI(xmlDoc* c_doc, const_xmlChar* target, const_xmlChar* text) noexcept:
  2707. cdef xmlNode* c_node
  2708. c_node = tree.xmlNewDocPI(c_doc, target, text)
  2709. return c_node
  2710. cdef xmlNode* _createEntity(xmlDoc* c_doc, const_xmlChar* name) noexcept:
  2711. cdef xmlNode* c_node
  2712. c_node = tree.xmlNewReference(c_doc, name)
  2713. return c_node
  2714. # module-level API for ElementTree
  2715. from abc import ABC
  2716. class Element(ABC):
  2717. """Element(_tag, attrib=None, nsmap=None, **_extra)
  2718. Element factory, as a class.
  2719. An instance of this class is an object implementing the
  2720. Element interface.
  2721. >>> element = Element("test")
  2722. >>> type(element)
  2723. <class 'lxml.etree._Element'>
  2724. >>> isinstance(element, Element)
  2725. True
  2726. >>> issubclass(_Element, Element)
  2727. True
  2728. Also look at the `_Element.makeelement()` and
  2729. `_BaseParser.makeelement()` methods, which provide a faster way to
  2730. create an Element within a specific document or parser context.
  2731. """
  2732. def __new__(cls, _tag, attrib=None, nsmap=None, **_extra):
  2733. return _makeElement(_tag, NULL, None, None, None, None,
  2734. attrib, nsmap, _extra)
  2735. # Register _Element as a virtual subclass of Element
  2736. Element.register(_Element)
  2737. def Comment(text=None):
  2738. """Comment(text=None)
  2739. Comment element factory. This factory function creates a special element that will
  2740. be serialized as an XML comment.
  2741. """
  2742. cdef _Document doc
  2743. cdef xmlNode* c_node
  2744. cdef xmlDoc* c_doc
  2745. if text is None:
  2746. text = b''
  2747. else:
  2748. text = _utf8(text)
  2749. if b'--' in text or text.endswith(b'-'):
  2750. raise ValueError("Comment may not contain '--' or end with '-'")
  2751. c_doc = _newXMLDoc()
  2752. doc = _documentFactory(c_doc, None)
  2753. c_node = _createComment(c_doc, _xcstr(text))
  2754. tree.xmlAddChild(<xmlNode*>c_doc, c_node)
  2755. return _elementFactory(doc, c_node)
  2756. def ProcessingInstruction(target, text=None):
  2757. """ProcessingInstruction(target, text=None)
  2758. ProcessingInstruction element factory. This factory function creates a
  2759. special element that will be serialized as an XML processing instruction.
  2760. """
  2761. cdef _Document doc
  2762. cdef xmlNode* c_node
  2763. cdef xmlDoc* c_doc
  2764. target = _utf8(target)
  2765. _tagValidOrRaise(target)
  2766. if target.lower() == b'xml':
  2767. raise ValueError, f"Invalid PI name '{target}'"
  2768. if text is None:
  2769. text = b''
  2770. else:
  2771. text = _utf8(text)
  2772. if b'?>' in text:
  2773. raise ValueError, "PI text must not contain '?>'"
  2774. c_doc = _newXMLDoc()
  2775. doc = _documentFactory(c_doc, None)
  2776. c_node = _createPI(c_doc, _xcstr(target), _xcstr(text))
  2777. tree.xmlAddChild(<xmlNode*>c_doc, c_node)
  2778. return _elementFactory(doc, c_node)
  2779. PI = ProcessingInstruction
  2780. cdef class CDATA:
  2781. """CDATA(data)
  2782. CDATA factory. This factory creates an opaque data object that
  2783. can be used to set Element text. The usual way to use it is::
  2784. >>> el = Element('content')
  2785. >>> el.text = CDATA('a string')
  2786. >>> print(el.text)
  2787. a string
  2788. >>> print(tostring(el, encoding="unicode"))
  2789. <content><![CDATA[a string]]></content>
  2790. """
  2791. cdef bytes _utf8_data
  2792. def __cinit__(self, data):
  2793. self._utf8_data = _utf8(data)
  2794. def Entity(name):
  2795. """Entity(name)
  2796. Entity factory. This factory function creates a special element
  2797. that will be serialized as an XML entity reference or character
  2798. reference. Note, however, that entities will not be automatically
  2799. declared in the document. A document that uses entity references
  2800. requires a DTD to define the entities.
  2801. """
  2802. cdef _Document doc
  2803. cdef xmlNode* c_node
  2804. cdef xmlDoc* c_doc
  2805. name_utf = _utf8(name)
  2806. c_name = _xcstr(name_utf)
  2807. if c_name[0] == c'#':
  2808. if not _characterReferenceIsValid(c_name + 1):
  2809. raise ValueError, f"Invalid character reference: '{name}'"
  2810. elif not _xmlNameIsValid(c_name):
  2811. raise ValueError, f"Invalid entity reference: '{name}'"
  2812. c_doc = _newXMLDoc()
  2813. doc = _documentFactory(c_doc, None)
  2814. c_node = _createEntity(c_doc, c_name)
  2815. tree.xmlAddChild(<xmlNode*>c_doc, c_node)
  2816. return _elementFactory(doc, c_node)
  2817. def SubElement(_Element _parent not None, _tag,
  2818. attrib=None, nsmap=None, **_extra):
  2819. """SubElement(_parent, _tag, attrib=None, nsmap=None, **_extra)
  2820. Subelement factory. This function creates an element instance, and
  2821. appends it to an existing element.
  2822. """
  2823. return _makeSubElement(_parent, _tag, None, None, attrib, nsmap, _extra)
  2824. from typing import Generic, TypeVar
  2825. T = TypeVar("T")
  2826. class ElementTree(ABC, Generic[T]):
  2827. def __new__(cls, _Element element=None, *, file=None, _BaseParser parser=None):
  2828. """ElementTree(element=None, file=None, parser=None)
  2829. ElementTree wrapper class.
  2830. """
  2831. cdef xmlNode* c_next
  2832. cdef xmlNode* c_node
  2833. cdef xmlNode* c_node_copy
  2834. cdef xmlDoc* c_doc
  2835. cdef _ElementTree etree
  2836. cdef _Document doc
  2837. if element is not None:
  2838. doc = element._doc
  2839. elif file is not None:
  2840. try:
  2841. doc = _parseDocument(file, parser, None)
  2842. except _TargetParserResult as result_container:
  2843. return result_container.result
  2844. else:
  2845. c_doc = _newXMLDoc()
  2846. doc = _documentFactory(c_doc, parser)
  2847. return _elementTreeFactory(doc, element)
  2848. # Register _ElementTree as a virtual subclass of ElementTree
  2849. ElementTree.register(_ElementTree)
  2850. # Remove "ABC" and typing helpers from module dict
  2851. del ABC, Generic, TypeVar, T
  2852. def HTML(text, _BaseParser parser=None, *, base_url=None):
  2853. """HTML(text, parser=None, base_url=None)
  2854. Parses an HTML document from a string constant. Returns the root
  2855. node (or the result returned by a parser target). This function
  2856. can be used to embed "HTML literals" in Python code.
  2857. To override the parser with a different ``HTMLParser`` you can pass it to
  2858. the ``parser`` keyword argument.
  2859. The ``base_url`` keyword argument allows to set the original base URL of
  2860. the document to support relative Paths when looking up external entities
  2861. (DTD, XInclude, ...).
  2862. """
  2863. cdef _Document doc
  2864. if parser is None:
  2865. parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
  2866. if not isinstance(parser, HTMLParser):
  2867. parser = __DEFAULT_HTML_PARSER
  2868. try:
  2869. doc = _parseMemoryDocument(text, base_url, parser)
  2870. return doc.getroot()
  2871. except _TargetParserResult as result_container:
  2872. return result_container.result
  2873. def XML(text, _BaseParser parser=None, *, base_url=None):
  2874. """XML(text, parser=None, base_url=None)
  2875. Parses an XML document or fragment from a string constant.
  2876. Returns the root node (or the result returned by a parser target).
  2877. This function can be used to embed "XML literals" in Python code,
  2878. like in
  2879. >>> root = XML("<root><test/></root>")
  2880. >>> print(root.tag)
  2881. root
  2882. To override the parser with a different ``XMLParser`` you can pass it to
  2883. the ``parser`` keyword argument.
  2884. The ``base_url`` keyword argument allows to set the original base URL of
  2885. the document to support relative Paths when looking up external entities
  2886. (DTD, XInclude, ...).
  2887. """
  2888. cdef _Document doc
  2889. if parser is None:
  2890. parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
  2891. if not isinstance(parser, XMLParser):
  2892. parser = __DEFAULT_XML_PARSER
  2893. try:
  2894. doc = _parseMemoryDocument(text, base_url, parser)
  2895. return doc.getroot()
  2896. except _TargetParserResult as result_container:
  2897. return result_container.result
  2898. def fromstring(text, _BaseParser parser=None, *, base_url=None):
  2899. """fromstring(text, parser=None, base_url=None)
  2900. Parses an XML document or fragment from a string. Returns the
  2901. root node (or the result returned by a parser target).
  2902. To override the default parser with a different parser you can pass it to
  2903. the ``parser`` keyword argument.
  2904. The ``base_url`` keyword argument allows to set the original base URL of
  2905. the document to support relative Paths when looking up external entities
  2906. (DTD, XInclude, ...).
  2907. """
  2908. cdef _Document doc
  2909. try:
  2910. doc = _parseMemoryDocument(text, base_url, parser)
  2911. return doc.getroot()
  2912. except _TargetParserResult as result_container:
  2913. return result_container.result
  2914. def fromstringlist(strings, _BaseParser parser=None):
  2915. """fromstringlist(strings, parser=None)
  2916. Parses an XML document from a sequence of strings. Returns the
  2917. root node (or the result returned by a parser target).
  2918. To override the default parser with a different parser you can pass it to
  2919. the ``parser`` keyword argument.
  2920. """
  2921. cdef _Document doc
  2922. if isinstance(strings, (bytes, unicode)):
  2923. raise ValueError("passing a single string into fromstringlist() is not"
  2924. " efficient, use fromstring() instead")
  2925. if parser is None:
  2926. parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
  2927. feed = parser.feed
  2928. for data in strings:
  2929. feed(data)
  2930. return parser.close()
  2931. def iselement(element):
  2932. """iselement(element)
  2933. Checks if an object appears to be a valid element object.
  2934. """
  2935. return isinstance(element, _Element) and (<_Element>element)._c_node is not NULL
  2936. def indent(tree, space=" ", *, Py_ssize_t level=0):
  2937. """indent(tree, space=" ", level=0)
  2938. Indent an XML document by inserting newlines and indentation space
  2939. after elements.
  2940. *tree* is the ElementTree or Element to modify. The (root) element
  2941. itself will not be changed, but the tail text of all elements in its
  2942. subtree will be adapted.
  2943. *space* is the whitespace to insert for each indentation level, two
  2944. space characters by default.
  2945. *level* is the initial indentation level. Setting this to a higher
  2946. value than 0 can be used for indenting subtrees that are more deeply
  2947. nested inside of a document.
  2948. """
  2949. root = _rootNodeOrRaise(tree)
  2950. if level < 0:
  2951. raise ValueError(f"Initial indentation level must be >= 0, got {level}")
  2952. if _hasChild(root._c_node):
  2953. space = _utf8(space)
  2954. indent = b"\n" + level * space
  2955. _indent_children(root._c_node, 1, space, [indent, indent + space])
  2956. cdef int _indent_children(xmlNode* c_node, Py_ssize_t level, bytes one_space, list indentations) except -1:
  2957. # Reuse indentation strings for speed.
  2958. if len(indentations) <= level:
  2959. indentations.append(indentations[-1] + one_space)
  2960. # Start a new indentation level for the first child.
  2961. child_indentation = indentations[level]
  2962. if not _hasNonWhitespaceText(c_node):
  2963. _setNodeText(c_node, child_indentation)
  2964. # Recursively indent all children.
  2965. cdef xmlNode* c_child = _findChildForwards(c_node, 0)
  2966. while c_child is not NULL:
  2967. if _hasChild(c_child):
  2968. _indent_children(c_child, level+1, one_space, indentations)
  2969. c_next_child = _nextElement(c_child)
  2970. if not _hasNonWhitespaceTail(c_child):
  2971. if c_next_child is NULL:
  2972. # Dedent after the last child.
  2973. child_indentation = indentations[level-1]
  2974. _setTailText(c_child, child_indentation)
  2975. c_child = c_next_child
  2976. return 0
  2977. def dump(_Element elem not None, *, bint pretty_print=True, bint with_tail=True):
  2978. """dump(elem, pretty_print=True, with_tail=True)
  2979. Writes an element tree or element structure to sys.stdout. This function
  2980. should be used for debugging only.
  2981. """
  2982. xml = tostring(elem, pretty_print=pretty_print, with_tail=with_tail, encoding='unicode')
  2983. if not pretty_print:
  2984. xml += '\n'
  2985. sys.stdout.write(xml)
  2986. def tostring(element_or_tree, *, encoding=None, method="xml",
  2987. xml_declaration=None, bint pretty_print=False, bint with_tail=True,
  2988. standalone=None, doctype=None,
  2989. # method='c14n'
  2990. bint exclusive=False, inclusive_ns_prefixes=None,
  2991. # method='c14n2'
  2992. bint with_comments=True, bint strip_text=False,
  2993. ):
  2994. """tostring(element_or_tree, encoding=None, method="xml",
  2995. xml_declaration=None, pretty_print=False, with_tail=True,
  2996. standalone=None, doctype=None,
  2997. exclusive=False, inclusive_ns_prefixes=None,
  2998. with_comments=True, strip_text=False,
  2999. )
  3000. Serialize an element to an encoded string representation of its XML
  3001. tree.
  3002. Defaults to ASCII encoding without XML declaration. This
  3003. behaviour can be configured with the keyword arguments 'encoding'
  3004. (string) and 'xml_declaration' (bool). Note that changing the
  3005. encoding to a non UTF-8 compatible encoding will enable a
  3006. declaration by default.
  3007. You can also serialise to a Unicode string without declaration by
  3008. passing the name ``'unicode'`` as encoding (or the ``str`` function
  3009. in Py3 or ``unicode`` in Py2). This changes the return value from
  3010. a byte string to an unencoded unicode string.
  3011. The keyword argument 'pretty_print' (bool) enables formatted XML.
  3012. The keyword argument 'method' selects the output method: 'xml',
  3013. 'html', plain 'text' (text content without tags), 'c14n' or 'c14n2'.
  3014. Default is 'xml'.
  3015. With ``method="c14n"`` (C14N version 1), the options ``exclusive``,
  3016. ``with_comments`` and ``inclusive_ns_prefixes`` request exclusive
  3017. C14N, include comments, and list the inclusive prefixes respectively.
  3018. With ``method="c14n2"`` (C14N version 2), the ``with_comments`` and
  3019. ``strip_text`` options control the output of comments and text space
  3020. according to C14N 2.0.
  3021. Passing a boolean value to the ``standalone`` option will output
  3022. an XML declaration with the corresponding ``standalone`` flag.
  3023. The ``doctype`` option allows passing in a plain string that will
  3024. be serialised before the XML tree. Note that passing in non
  3025. well-formed content here will make the XML output non well-formed.
  3026. Also, an existing doctype in the document tree will not be removed
  3027. when serialising an ElementTree instance.
  3028. You can prevent the tail text of the element from being serialised
  3029. by passing the boolean ``with_tail`` option. This has no impact
  3030. on the tail text of children, which will always be serialised.
  3031. """
  3032. cdef bint write_declaration
  3033. cdef int is_standalone
  3034. # C14N serialisation
  3035. if method in ('c14n', 'c14n2'):
  3036. if encoding is not None:
  3037. raise ValueError("Cannot specify encoding with C14N")
  3038. if xml_declaration:
  3039. raise ValueError("Cannot enable XML declaration in C14N")
  3040. if method == 'c14n':
  3041. return _tostringC14N(element_or_tree, exclusive, with_comments, inclusive_ns_prefixes)
  3042. else:
  3043. out = BytesIO()
  3044. target = C14NWriterTarget(
  3045. utf8_writer(out).write,
  3046. with_comments=with_comments, strip_text=strip_text)
  3047. _tree_to_target(element_or_tree, target)
  3048. return out.getvalue()
  3049. if not with_comments:
  3050. raise ValueError("Can only discard comments in C14N serialisation")
  3051. if strip_text:
  3052. raise ValueError("Can only strip text in C14N 2.0 serialisation")
  3053. if encoding is unicode or (encoding is not None and encoding.lower() == 'unicode'):
  3054. if xml_declaration:
  3055. raise ValueError, \
  3056. "Serialisation to unicode must not request an XML declaration"
  3057. write_declaration = 0
  3058. encoding = unicode
  3059. elif xml_declaration is None:
  3060. # by default, write an XML declaration only for non-standard encodings
  3061. write_declaration = encoding is not None and encoding.upper() not in \
  3062. ('ASCII', 'UTF-8', 'UTF8', 'US-ASCII')
  3063. else:
  3064. write_declaration = xml_declaration
  3065. if encoding is None:
  3066. encoding = 'ASCII'
  3067. if standalone is None:
  3068. is_standalone = -1
  3069. elif standalone:
  3070. write_declaration = 1
  3071. is_standalone = 1
  3072. else:
  3073. write_declaration = 1
  3074. is_standalone = 0
  3075. if isinstance(element_or_tree, _Element):
  3076. return _tostring(<_Element>element_or_tree, encoding, doctype, method,
  3077. write_declaration, 0, pretty_print, with_tail,
  3078. is_standalone)
  3079. elif isinstance(element_or_tree, _ElementTree):
  3080. return _tostring((<_ElementTree>element_or_tree)._context_node,
  3081. encoding, doctype, method, write_declaration, 1,
  3082. pretty_print, with_tail, is_standalone)
  3083. else:
  3084. raise TypeError, f"Type '{python._fqtypename(element_or_tree).decode('utf8')}' cannot be serialized."
  def tostringlist(element_or_tree, *args, **kwargs):
      """tostringlist(element_or_tree, *args, **kwargs)

      Serialize an element (or tree) to its encoded XML string
      representation and return it as a list of partial strings.

      All positional and keyword arguments are passed through unchanged
      to ``tostring()``.

      This function only exists for ElementTree 1.3 compatibility.  The
      output is not actually split up: the returned list always holds
      exactly one item, the complete serialisation.
      """
      serialised = tostring(element_or_tree, *args, **kwargs)
      return [serialised]
  def tounicode(element_or_tree, *, method="xml", bint pretty_print=False,
                bint with_tail=True, doctype=None):
      """tounicode(element_or_tree, method="xml", pretty_print=False,
                   with_tail=True, doctype=None)

      Serialize an element to the Python unicode representation of its XML
      tree.

      :deprecated: use ``tostring(el, encoding='unicode')`` instead.

      Note that the result does not carry an XML encoding declaration and is
      therefore not necessarily suited for serialization to byte streams without
      further treatment.

      The boolean keyword argument 'pretty_print' enables formatted XML.

      The keyword argument 'method' selects the output method: 'xml',
      'html' or plain 'text'.

      You can prevent the tail text of the element from being serialised
      by passing the boolean ``with_tail`` option.  This has no impact
      on the tail text of children, which will always be serialised.

      Raises ``TypeError`` if ``element_or_tree`` is neither an element
      nor an element tree.
      """
      # The constant arguments mirror what tostring() passes for
      # encoding=unicode with standalone=None: no XML declaration (0)
      # and an unspecified standalone flag (-1).  The flag in between
      # is 0 for a single element and 1 for a whole tree.
      if isinstance(element_or_tree, _Element):
          return _tostring(<_Element>element_or_tree, unicode, doctype, method,
                           0, 0, pretty_print, with_tail, -1)
      elif isinstance(element_or_tree, _ElementTree):
          return _tostring((<_ElementTree>element_or_tree)._context_node,
                           unicode, doctype, method, 0, 1, pretty_print,
                           with_tail, -1)
      else:
          # Report the type name the same way tostring() does, instead of
          # embedding the "<class '...'>" repr inside another pair of quotes.
          raise TypeError(
              f"Type '{python._fqtypename(element_or_tree).decode('utf8')}' cannot be serialized.")
  def parse(source, _BaseParser parser=None, *, base_url=None):
      """parse(source, parser=None, base_url=None)

      Load ``source`` and return it as an ElementTree object.  The default
      parser is used unless a parser is passed as second argument.

      Accepted kinds of ``source``:

      - a file name/path

      - a file object

      - a file-like object

      - a URL using the HTTP or FTP protocol

      For parsing from a string, use the ``fromstring()`` function instead.

      Parsing from a file path or URL is generally faster than parsing
      from an open file object or file-like object.  Gzip compressed
      sources are decompressed transparently (unless this is explicitly
      disabled in libxml2).

      Use the ``base_url`` keyword to set a URL for the document when
      parsing from a file-like object.  It is required for looking up
      external entities (DTD, XInclude, ...) with relative paths.
      """
      cdef _Document parsed_doc
      try:
          parsed_doc = _parseDocument(source, parser, base_url)
          tree = _elementTreeFactory(parsed_doc, None)
      except _TargetParserResult as target_result:
          # The result may arrive through this exception instead of as a
          # document (presumably for parsers with a custom target);
          # return its payload as-is.
          tree = target_result.result
      return tree
  def adopt_external_document(capsule, _BaseParser parser=None):
      """adopt_external_document(capsule, parser=None)

      Take a libxml2 document pointer out of a PyCapsule and wrap it in an
      lxml ElementTree object.

      External libraries can use this to build XML/HTML trees with libxml2
      and then hand them over to lxml efficiently for further processing.

      A ``parser``, if given, is used for configuring the lxml document.
      No parsing will be done.

      The capsule must be named ``"libxml2:xmlDoc"`` and its pointer value
      must reference a correct libxml2 document of type ``xmlDoc*``.
      Cleaning up the document correctly, using an appropriate capsule
      destructor, is the responsibility of the capsule's creator.  By
      default, the libxml2 document is copied so that lxml can safely own
      the memory of the internal tree that it uses.

      A non-NULL capsule context must point to a C string that can be
      compared using ``strcmp()``.  If that string equals
      ``"destructor:xmlFreeDoc"``, the libxml2 document is not copied.
      Instead, the capsule is invalidated by clearing its destructor and
      name.  lxml then takes ownership of the libxml2 document in memory
      without creating a copy first, and the capsule destructor will not
      be called.  lxml eventually cleans the document up with the libxml2
      API function ``xmlFreeDoc()`` once it is no longer used.

      If no copy is made, the tree should not be modified outside of lxml
      any more after the ownership was transferred.
      """
      # The unpacking helper receives the flag by address and may set it.
      cdef bint owns_doc = False
      cdef xmlDoc* c_doc = <xmlDoc*> python.lxml_unpack_xmldoc_capsule(
          capsule, &owns_doc)
      return _elementTreeFactory(
          _adoptForeignDoc(c_doc, parser, owns_doc), None)
  3173. ################################################################################
  3174. # Include submodules
  3175. include "readonlytree.pxi" # Read-only implementation of Element proxies
  3176. include "classlookup.pxi" # Element class lookup mechanisms
  3177. include "nsclasses.pxi" # Namespace implementation and registry
  3178. include "docloader.pxi" # Support for custom document loaders
  3179. include "parser.pxi" # XML and HTML parsers
  3180. include "saxparser.pxi" # SAX-like Parser interface and tree builder
  3181. include "parsertarget.pxi" # ET Parser target
  3182. include "serializer.pxi" # XML output functions
  3183. include "iterparse.pxi" # incremental XML parsing
  3184. include "xmlid.pxi" # XMLID and IDDict
  3185. include "xinclude.pxi" # XInclude
  3186. include "cleanup.pxi" # Cleanup and recursive element removal functions
  3187. ################################################################################
  3188. # Include submodules for XPath and XSLT
  3189. include "extensions.pxi" # XPath/XSLT extension functions
  3190. include "xpath.pxi" # XPath evaluation
  3191. include "xslt.pxi" # XSL transformations
  3192. include "xsltext.pxi" # XSL extension elements
  3193. ################################################################################
  3194. # Validation
  cdef class DocumentInvalid(LxmlError):
      """Validation error.

      This is the exception that document validators raise from their
      ``assertValid(tree)`` method when the document fails validation.
      """
  cdef class _Validator:
      """Base class for XML validators.

      All checking methods delegate to calling the instance itself
      (``self(etree)``).  No ``__call__`` is defined in this class, so
      concrete validators are expected to provide it.  Messages are
      collected in a per-instance error log that is exposed (as a copy)
      through the ``error_log`` property.
      """
      cdef _ErrorLog _error_log

      def __cinit__(self):
          # Every validator gets its own, initially empty error log.
          self._error_log = _ErrorLog()

      def validate(self, etree):
          """validate(self, etree)

          Validate the document using this schema.

          Returns true if document is valid, false if not.
          """
          return self(etree)

      def assertValid(self, etree):
          """assertValid(self, etree)

          Raises `DocumentInvalid` if the document does not comply with the schema.
          """
          if not self(etree):
              # The error log is passed along as second exception argument.
              raise DocumentInvalid(self._error_log._buildExceptionMessage(
                      "Document does not comply with schema"),
                  self._error_log)

      def assert_(self, etree):
          """assert_(self, etree)

          Raises `AssertionError` if the document does not comply with the schema.
          """
          if not self(etree):
              # Raised explicitly rather than via an ``assert`` statement.
              raise AssertionError(self._error_log._buildExceptionMessage(
                  "Document does not comply with schema"))

      cpdef _append_log_message(self, int domain, int type, int level, int line,
                                message, filename):
          # Forward a single message to this validator's error log.
          self._error_log._receiveGeneric(domain, type, level, line, message,
                                          filename)

      cpdef _clear_error_log(self):
          # Drop all messages collected so far.
          self._error_log.clear()

      @property
      def error_log(self):
          """The log of validation errors and warnings."""
          # __cinit__ always creates the log, so this guards an internal
          # invariant only.
          assert self._error_log is not None, "Validator not initialised"
          return self._error_log.copy()
  3237. include "dtd.pxi" # DTD
  3238. include "relaxng.pxi" # RelaxNG
  3239. include "xmlschema.pxi" # XMLSchema
  3240. include "schematron.pxi" # Schematron (requires libxml2 2.6.21+)
  3241. ################################################################################
  3242. # Public C API
  3243. include "public-api.pxi"
  3244. ################################################################################
  3245. # Other stuff
  3246. include "debug.pxi"