Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 
 

1850 lignes
66 KiB

  1. # XML serialization and output functions
  2. cdef object GzipFile
  3. from gzip import GzipFile
cdef class SerialisationError(LxmlError):
    """A libxml2 error that occurred during serialisation.

    Raised by the serialisation helpers in this file when libxml2 reports
    a failure that is not an out-of-memory condition (those are raised as
    ``MemoryError`` instead).
    """
# Internal serialisation modes; _findOutputMethod() maps the user-facing
# method names "xml", "html" and "text" onto these values.
cdef enum _OutputMethods:
    OUTPUT_METHOD_XML
    OUTPUT_METHOD_HTML
    OUTPUT_METHOD_TEXT
  11. cdef int _findOutputMethod(method) except -1:
  12. if method is None:
  13. return OUTPUT_METHOD_XML
  14. method = method.lower()
  15. if method == "xml":
  16. return OUTPUT_METHOD_XML
  17. if method == "html":
  18. return OUTPUT_METHOD_HTML
  19. if method == "text":
  20. return OUTPUT_METHOD_TEXT
  21. raise ValueError(f"unknown output method {method!r}")
cdef _textToString(xmlNode* c_node, encoding, bint with_tail):
    """Serialise the text content of a node (method="text").

    The node content is collected into a temporary libxml2 buffer; when
    *with_tail* is set, the content of the text nodes that follow the node
    (as selected by ``_textNodeOrSkip()``) is appended as well.

    Returns a decoded string if *encoding* is ``unicode``, otherwise bytes:
    the raw buffer content for ``None``/UTF-8, or a re-encoded copy for
    other encodings.

    Raises ``MemoryError`` if the buffer cannot be allocated and
    ``SerialisationError`` if libxml2 fails to collect the content.
    """
    cdef bint needs_conversion
    cdef const_xmlChar* c_text
    cdef xmlNode* c_text_node
    cdef tree.xmlBuffer* c_buffer
    cdef int error_result

    c_buffer = tree.xmlBufferCreate()
    if c_buffer is NULL:
        raise MemoryError()

    # Pure C work on the tree: no Python objects are touched in here.
    with nogil:
        error_result = tree.xmlNodeBufGetContent(c_buffer, c_node)
        if with_tail:
            c_text_node = _textNodeOrSkip(c_node.next)
            while c_text_node is not NULL:
                tree.xmlBufferWriteChar(c_buffer, <const_char*>c_text_node.content)
                c_text_node = _textNodeOrSkip(c_text_node.next)
        c_text = tree.xmlBufferContent(c_buffer)

    if error_result < 0 or c_text is NULL:
        # Free explicitly: the try/finally below has not been entered yet.
        tree.xmlBufferFree(c_buffer)
        raise SerialisationError, "Error during serialisation (out of memory?)"

    try:
        needs_conversion = 0
        if encoding is unicode:
            needs_conversion = 1
        elif encoding is not None:
            # Python prefers lower case encoding names
            encoding = encoding.lower()
            if encoding not in ('utf8', 'utf-8'):
                if encoding == 'ascii':
                    # NOTE(review): isutf8l() presumably reports non-ASCII
                    # content here - confirm against its definition.
                    if isutf8l(c_text, tree.xmlBufferLength(c_buffer)):
                        # the re-encoding step below is expected to raise
                        needs_conversion = 1
                else:
                    needs_conversion = 1

        if needs_conversion:
            # Decode the buffer content first, then re-encode unless a
            # decoded string was requested.
            text = (<const_char*>c_text)[:tree.xmlBufferLength(c_buffer)].decode('utf8')
            if encoding is not unicode:
                encoding = _utf8(encoding)
                text = python.PyUnicode_AsEncodedString(
                    text, encoding, 'strict')
        else:
            # No conversion needed: copy the buffer content out as bytes.
            text = (<unsigned char*>c_text)[:tree.xmlBufferLength(c_buffer)]
    finally:
        tree.xmlBufferFree(c_buffer)
    return text
cdef _tostring(_Element element, encoding, doctype, method,
               bint write_xml_declaration, bint write_complete_document,
               bint pretty_print, bint with_tail, int standalone):
    """Serialize an element to an encoded string representation of its XML
    tree.

    Returns ``None`` if *element* is ``None``.  For method "text" the work
    is delegated to ``_textToString()``.  Otherwise the node is written into
    an in-memory libxml2 output buffer and the result is returned as a
    decoded string (when *encoding* is ``unicode``) or as bytes.

    Raises ``LookupError`` for an encoding libxml2 has no handler for,
    ``MemoryError`` if the output buffer cannot be allocated, and whatever
    ``_raiseSerialisationError()`` raises when libxml2 reports a failure.
    """
    cdef tree.xmlOutputBuffer* c_buffer
    cdef tree.xmlBuf* c_result_buffer
    cdef tree.xmlCharEncodingHandler* enchandler
    cdef const_char* c_enc
    cdef const_xmlChar* c_version
    cdef const_xmlChar* c_doctype
    cdef int c_method
    cdef int error_result
    if element is None:
        return None
    _assertValidNode(element)
    c_method = _findOutputMethod(method)
    if c_method == OUTPUT_METHOD_TEXT:
        return _textToString(element._c_node, encoding, with_tail)
    if encoding is None or encoding is unicode:
        c_enc = NULL
    else:
        # Rebinding 'encoding' keeps the converted object referenced while
        # c_enc is in use.
        encoding = _utf8(encoding)
        c_enc = _cstr(encoding)
    if doctype is None:
        c_doctype = NULL
    else:
        doctype = _utf8(doctype)
        c_doctype = _xcstr(doctype)
    # it is necessary to *both* find the encoding handler *and* pass the
    # encoding name during output
    enchandler = tree.xmlFindCharEncodingHandler(c_enc)
    if enchandler is NULL and c_enc is not NULL:
        if encoding is not None:
            encoding = encoding.decode('UTF-8')
        raise LookupError, f"unknown encoding: '{encoding}'"
    c_buffer = tree.xmlAllocOutputBuffer(enchandler)
    if c_buffer is NULL:
        # The handler is released here because no buffer was created for it.
        tree.xmlCharEncCloseFunc(enchandler)
        raise MemoryError()

    with nogil:
        _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_doctype, c_method,
                           write_xml_declaration, write_complete_document,
                           pretty_print, with_tail, standalone)
        tree.xmlOutputBufferFlush(c_buffer)
        # Read the result from the conversion buffer when one is present,
        # otherwise from the plain output buffer.
        if c_buffer.conv is not NULL:
            c_result_buffer = c_buffer.conv
        else:
            c_result_buffer = c_buffer.buffer

    error_result = c_buffer.error
    if error_result != xmlerror.XML_ERR_OK:
        tree.xmlOutputBufferClose(c_buffer)
        _raiseSerialisationError(error_result)

    try:
        # Copy the data out before the buffer is closed in 'finally'.
        if encoding is unicode:
            result = (<unsigned char*>tree.xmlBufContent(
                c_result_buffer))[:tree.xmlBufUse(c_result_buffer)].decode('UTF-8')
        else:
            result = <bytes>(<unsigned char*>tree.xmlBufContent(
                c_result_buffer))[:tree.xmlBufUse(c_result_buffer)]
    finally:
        error_result = tree.xmlOutputBufferClose(c_buffer)
    if error_result == -1:
        _raiseSerialisationError(error_result)
    return result
  133. cdef bytes _tostringC14N(element_or_tree, bint exclusive, bint with_comments, inclusive_ns_prefixes):
  134. cdef xmlDoc* c_doc
  135. cdef xmlChar* c_buffer = NULL
  136. cdef int byte_count = -1
  137. cdef bytes result
  138. cdef _Document doc
  139. cdef _Element element
  140. cdef xmlChar **c_inclusive_ns_prefixes
  141. if isinstance(element_or_tree, _Element):
  142. _assertValidNode(<_Element>element_or_tree)
  143. doc = (<_Element>element_or_tree)._doc
  144. c_doc = _plainFakeRootDoc(doc._c_doc, (<_Element>element_or_tree)._c_node, 0)
  145. else:
  146. doc = _documentOrRaise(element_or_tree)
  147. _assertValidDoc(doc)
  148. c_doc = doc._c_doc
  149. c_inclusive_ns_prefixes = _convert_ns_prefixes(c_doc.dict, inclusive_ns_prefixes) if inclusive_ns_prefixes else NULL
  150. try:
  151. with nogil:
  152. byte_count = c14n.xmlC14NDocDumpMemory(
  153. c_doc, NULL, exclusive, c_inclusive_ns_prefixes, with_comments, &c_buffer)
  154. finally:
  155. _destroyFakeDoc(doc._c_doc, c_doc)
  156. if c_inclusive_ns_prefixes is not NULL:
  157. python.lxml_free(c_inclusive_ns_prefixes)
  158. if byte_count < 0 or c_buffer is NULL:
  159. if c_buffer is not NULL:
  160. tree.xmlFree(c_buffer)
  161. raise C14NError, "C14N failed"
  162. try:
  163. result = c_buffer[:byte_count]
  164. finally:
  165. tree.xmlFree(c_buffer)
  166. return result
  167. cdef _raiseSerialisationError(int error_result):
  168. if error_result == xmlerror.XML_ERR_NO_MEMORY:
  169. raise MemoryError()
  170. message = ErrorTypes._getName(error_result)
  171. if message is None:
  172. message = f"unknown error {error_result}"
  173. raise SerialisationError, message
  174. ############################################################
  175. # low-level serialisation functions
  176. cdef void _writeDoctype(tree.xmlOutputBuffer* c_buffer,
  177. const_xmlChar* c_doctype) noexcept nogil:
  178. tree.xmlOutputBufferWrite(c_buffer, tree.xmlStrlen(c_doctype),
  179. <const_char*>c_doctype)
  180. tree.xmlOutputBufferWriteString(c_buffer, "\n")
cdef void _writeNodeToBuffer(tree.xmlOutputBuffer* c_buffer,
                             xmlNode* c_node, const_char* encoding, const_xmlChar* c_doctype,
                             int c_method, bint write_xml_declaration,
                             bint write_complete_document,
                             bint pretty_print, bint with_tail,
                             int standalone) noexcept nogil:
    """Write a node, plus optional document level content, to an output buffer.

    Output order: XML declaration (XML method only), siblings preceding the
    internal DTD subset, the explicit doctype string or the document's own
    DTD, siblings preceding the node, the node itself, its tail and
    following siblings.  Failures are reported through ``c_buffer.error``;
    nothing is raised.
    """
    cdef xmlNode* c_nsdecl_node
    cdef xmlDoc* c_doc = c_node.doc
    if write_xml_declaration and c_method == OUTPUT_METHOD_XML:
        _writeDeclarationToBuffer(c_buffer, c_doc.version, encoding, standalone)

    # comments/processing instructions before doctype declaration
    if write_complete_document and not c_buffer.error and c_doc.intSubset:
        _writePrevSiblings(c_buffer, <xmlNode*>c_doc.intSubset, encoding, pretty_print)

    if c_doctype:
        _writeDoctype(c_buffer, c_doctype)
    # write internal DTD subset, preceding PIs/comments, etc.
    if write_complete_document and not c_buffer.error:
        if c_doctype is NULL:
            # An explicit doctype string replaces the document's own DTD.
            _writeDtdToBuffer(c_buffer, c_doc, c_node.name, c_method, encoding)
        _writePrevSiblings(c_buffer, c_node, encoding, pretty_print)

    c_nsdecl_node = c_node
    if not c_node.parent or c_node.parent.type != tree.XML_DOCUMENT_NODE:
        # copy the node and add namespaces from parents
        # this is required to make libxml write them
        c_nsdecl_node = tree.xmlCopyNode(c_node, 2)
        if not c_nsdecl_node:
            c_buffer.error = xmlerror.XML_ERR_NO_MEMORY
            return

        _copyParentNamespaces(c_node, c_nsdecl_node)

        # Graft the original subtree onto the copy for the duration of the
        # dump; the links are cut again below before the copy is freed.
        c_nsdecl_node.parent = c_node.parent
        c_nsdecl_node.children = c_node.children
        c_nsdecl_node.last = c_node.last

    # write node
    if c_method == OUTPUT_METHOD_HTML:
        tree.htmlNodeDumpFormatOutput(
            c_buffer, c_doc, c_nsdecl_node, encoding, pretty_print)
    else:
        tree.xmlNodeDumpOutput(
            c_buffer, c_doc, c_nsdecl_node, 0, pretty_print, encoding)

    if c_nsdecl_node is not c_node:
        # clean up: detach the borrowed children first so that freeing the
        # copy does not free the original subtree
        c_nsdecl_node.children = c_nsdecl_node.last = NULL
        tree.xmlFreeNode(c_nsdecl_node)

    if c_buffer.error:
        return

    # write tail, trailing comments, etc.
    if with_tail:
        _writeTail(c_buffer, c_node, encoding, c_method, pretty_print)
    if write_complete_document:
        _writeNextSiblings(c_buffer, c_node, encoding, pretty_print)
    if pretty_print:
        tree.xmlOutputBufferWrite(c_buffer, 1, "\n")
  233. cdef void _writeDeclarationToBuffer(tree.xmlOutputBuffer* c_buffer,
  234. const_xmlChar* version, const_char* encoding,
  235. int standalone) noexcept nogil:
  236. if version is NULL:
  237. version = <unsigned char*>"1.0"
  238. tree.xmlOutputBufferWrite(c_buffer, 15, "<?xml version='")
  239. tree.xmlOutputBufferWriteString(c_buffer, <const_char*>version)
  240. tree.xmlOutputBufferWrite(c_buffer, 12, "' encoding='")
  241. tree.xmlOutputBufferWriteString(c_buffer, encoding)
  242. if standalone == 0:
  243. tree.xmlOutputBufferWrite(c_buffer, 20, "' standalone='no'?>\n")
  244. elif standalone == 1:
  245. tree.xmlOutputBufferWrite(c_buffer, 21, "' standalone='yes'?>\n")
  246. else:
  247. tree.xmlOutputBufferWrite(c_buffer, 4, "'?>\n")
cdef void _writeDtdToBuffer(tree.xmlOutputBuffer* c_buffer,
                            xmlDoc* c_doc, const_xmlChar* c_root_name,
                            int c_method, const_char* encoding) noexcept nogil:
    """Write the document's internal DTD subset as a DOCTYPE declaration.

    Nothing is written if the document has no named internal subset or if
    the DTD name does not match *c_root_name*.  Failures are reported
    through ``c_buffer.error``.
    """
    cdef tree.xmlDtd* c_dtd
    cdef xmlNode* c_node
    cdef char* quotechar
    c_dtd = c_doc.intSubset
    if not c_dtd or not c_dtd.name:
        return

    # Name in document type declaration must match the root element tag.
    # For XML, case sensitive match, for HTML insensitive.
    if c_method == OUTPUT_METHOD_HTML:
        if tree.xmlStrcasecmp(c_root_name, c_dtd.name) != 0:
            return
    else:
        if tree.xmlStrcmp(c_root_name, c_dtd.name) != 0:
            return

    tree.xmlOutputBufferWrite(c_buffer, 10, "<!DOCTYPE ")
    tree.xmlOutputBufferWriteString(c_buffer, <const_char*>c_dtd.name)

    # Treat empty identifiers the same as missing ones.
    cdef const_xmlChar* public_id = c_dtd.ExternalID
    cdef const_xmlChar* sys_url = c_dtd.SystemID
    if public_id and public_id[0] == b'\0':
        public_id = NULL
    if sys_url and sys_url[0] == b'\0':
        sys_url = NULL

    if public_id:
        tree.xmlOutputBufferWrite(c_buffer, 9, ' PUBLIC "')
        tree.xmlOutputBufferWriteString(c_buffer, <const_char*>public_id)
        if sys_url:
            tree.xmlOutputBufferWrite(c_buffer, 2, '" ')
        else:
            tree.xmlOutputBufferWrite(c_buffer, 1, '"')
    elif sys_url:
        tree.xmlOutputBufferWrite(c_buffer, 8, ' SYSTEM ')

    if sys_url:
        # Use single quotes when the system URL itself contains a double quote.
        if tree.xmlStrchr(sys_url, b'"'):
            quotechar = '\''
        else:
            quotechar = '"'
        tree.xmlOutputBufferWrite(c_buffer, 1, quotechar)
        tree.xmlOutputBufferWriteString(c_buffer, <const_char*>sys_url)
        tree.xmlOutputBufferWrite(c_buffer, 1, quotechar)

    if (not c_dtd.entities and not c_dtd.elements and
            not c_dtd.attributes and not c_dtd.notations and
            not c_dtd.pentities):
        # No declarations at all: close the DOCTYPE without a '[...]' part.
        tree.xmlOutputBufferWrite(c_buffer, 2, '>\n')
        return

    tree.xmlOutputBufferWrite(c_buffer, 3, ' [\n')
    if c_dtd.notations and not c_buffer.error:
        # Notations are dumped into a temporary buffer and copied over.
        c_buf = tree.xmlBufferCreate()
        if not c_buf:
            c_buffer.error = xmlerror.XML_ERR_NO_MEMORY
            return
        tree.xmlDumpNotationTable(c_buf, <tree.xmlNotationTable*>c_dtd.notations)
        tree.xmlOutputBufferWrite(
            c_buffer, tree.xmlBufferLength(c_buf),
            <const_char*>tree.xmlBufferContent(c_buf))
        tree.xmlBufferFree(c_buf)
    # Dump the DTD's child nodes one by one, stopping on the first error.
    c_node = c_dtd.children
    while c_node and not c_buffer.error:
        tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_node, 0, 0, encoding)
        c_node = c_node.next
    tree.xmlOutputBufferWrite(c_buffer, 3, "]>\n")
  311. cdef void _writeTail(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node,
  312. const_char* encoding, int c_method, bint pretty_print) noexcept nogil:
  313. "Write the element tail."
  314. c_node = c_node.next
  315. while c_node and not c_buffer.error and c_node.type in (
  316. tree.XML_TEXT_NODE, tree.XML_CDATA_SECTION_NODE):
  317. if c_method == OUTPUT_METHOD_HTML:
  318. tree.htmlNodeDumpFormatOutput(
  319. c_buffer, c_node.doc, c_node, encoding, pretty_print)
  320. else:
  321. tree.xmlNodeDumpOutput(
  322. c_buffer, c_node.doc, c_node, 0, pretty_print, encoding)
  323. c_node = c_node.next
  324. cdef void _writePrevSiblings(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node,
  325. const_char* encoding, bint pretty_print) noexcept nogil:
  326. cdef xmlNode* c_sibling
  327. if c_node.parent and _isElement(c_node.parent):
  328. return
  329. # we are at a root node, so add PI and comment siblings
  330. c_sibling = c_node
  331. while c_sibling.prev and \
  332. (c_sibling.prev.type == tree.XML_PI_NODE or
  333. c_sibling.prev.type == tree.XML_COMMENT_NODE):
  334. c_sibling = c_sibling.prev
  335. while c_sibling is not c_node and not c_buffer.error:
  336. tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_sibling, 0,
  337. pretty_print, encoding)
  338. if pretty_print:
  339. tree.xmlOutputBufferWriteString(c_buffer, "\n")
  340. c_sibling = c_sibling.next
  341. cdef void _writeNextSiblings(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node,
  342. const_char* encoding, bint pretty_print) noexcept nogil:
  343. cdef xmlNode* c_sibling
  344. if c_node.parent and _isElement(c_node.parent):
  345. return
  346. # we are at a root node, so add PI and comment siblings
  347. c_sibling = c_node.next
  348. while not c_buffer.error and c_sibling and \
  349. (c_sibling.type == tree.XML_PI_NODE or
  350. c_sibling.type == tree.XML_COMMENT_NODE):
  351. if pretty_print:
  352. tree.xmlOutputBufferWriteString(c_buffer, "\n")
  353. tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_sibling, 0,
  354. pretty_print, encoding)
  355. c_sibling = c_sibling.next
  356. # copied and adapted from libxml2 (xmlBufAttrSerializeTxtContent())
  357. cdef _write_attr_string(tree.xmlOutputBuffer* buf, const char *string):
  358. cdef const char *base
  359. cdef const char *cur
  360. if string == NULL:
  361. return
  362. base = cur = <const char*>string
  363. while cur[0] != 0:
  364. if cur[0] == b'\n':
  365. if base != cur:
  366. tree.xmlOutputBufferWrite(buf, cur - base, base)
  367. tree.xmlOutputBufferWrite(buf, 5, "&#10;")
  368. cur += 1
  369. base = cur
  370. elif cur[0] == b'\r':
  371. if base != cur:
  372. tree.xmlOutputBufferWrite(buf, cur - base, base)
  373. tree.xmlOutputBufferWrite(buf, 5, "&#13;")
  374. cur += 1
  375. base = cur
  376. elif cur[0] == b'\t':
  377. if base != cur:
  378. tree.xmlOutputBufferWrite(buf, cur - base, base)
  379. tree.xmlOutputBufferWrite(buf, 4, "&#9;")
  380. cur += 1
  381. base = cur
  382. elif cur[0] == b'"':
  383. if base != cur:
  384. tree.xmlOutputBufferWrite(buf, cur - base, base)
  385. tree.xmlOutputBufferWrite(buf, 6, "&quot;")
  386. cur += 1
  387. base = cur
  388. elif cur[0] == b'<':
  389. if base != cur:
  390. tree.xmlOutputBufferWrite(buf, cur - base, base)
  391. tree.xmlOutputBufferWrite(buf, 4, "&lt;")
  392. cur += 1
  393. base = cur
  394. elif cur[0] == b'>':
  395. if base != cur:
  396. tree.xmlOutputBufferWrite(buf, cur - base, base)
  397. tree.xmlOutputBufferWrite(buf, 4, "&gt;")
  398. cur += 1
  399. base = cur
  400. elif cur[0] == b'&':
  401. if base != cur:
  402. tree.xmlOutputBufferWrite(buf, cur - base, base)
  403. tree.xmlOutputBufferWrite(buf, 5, "&amp;")
  404. cur += 1
  405. base = cur
  406. else:
  407. # Leave further encoding and escaping to the buffer encoder.
  408. cur += 1
  409. if base != cur:
  410. tree.xmlOutputBufferWrite(buf, cur - base, base)
  411. cdef void _write_cdata_section(tree.xmlOutputBuffer* buf, const char* c_data, const char* c_end):
  412. tree.xmlOutputBufferWrite(buf, 9, "<![CDATA[")
  413. while c_end - c_data > limits.INT_MAX:
  414. tree.xmlOutputBufferWrite(buf, limits.INT_MAX, c_data)
  415. c_data += limits.INT_MAX
  416. tree.xmlOutputBufferWrite(buf, c_end - c_data, c_data)
  417. tree.xmlOutputBufferWrite(buf, 3, "]]>")
  418. cdef _write_cdata_string(tree.xmlOutputBuffer* buf, bytes bstring):
  419. cdef const char* c_data = bstring
  420. cdef const char* c_end = c_data + len(bstring)
  421. cdef const char* c_pos = c_data
  422. cdef bint nothing_written = True
  423. while True:
  424. c_pos = <const char*> cstring_h.memchr(c_pos, b']', c_end - c_pos)
  425. if not c_pos:
  426. break
  427. c_pos += 1
  428. next_char = c_pos[0]
  429. c_pos += 1
  430. if next_char != b']':
  431. continue
  432. # Found ']]', c_pos points to next character.
  433. while c_pos[0] == b']':
  434. c_pos += 1
  435. if c_pos[0] != b'>':
  436. if c_pos == c_end:
  437. break
  438. # c_pos[0] is neither ']' nor '>', continue with next character.
  439. c_pos += 1
  440. continue
  441. # Write section up to ']]' and start next block at trailing '>'.
  442. _write_cdata_section(buf, c_data, c_pos)
  443. nothing_written = False
  444. c_data = c_pos
  445. c_pos += 1
  446. if nothing_written or c_data < c_end:
  447. _write_cdata_section(buf, c_data, c_end)
  448. ############################################################
  449. # output to file-like objects
  450. cdef object io_open
  451. from io import open as io_open
  452. cdef object gzip
  453. import gzip
  454. cdef object getwriter
  455. from codecs import getwriter
  456. cdef object utf8_writer = getwriter('utf8')
  457. cdef object contextmanager
  458. from contextlib import contextmanager
  459. cdef object _open_utf8_file
  460. @contextmanager
  461. def _open_utf8_file(file, compression=0):
  462. file = _getFSPathOrObject(file)
  463. if _isString(file):
  464. if compression:
  465. with gzip.GzipFile(file, mode='wb', compresslevel=compression) as zf:
  466. yield utf8_writer(zf)
  467. else:
  468. with io_open(file, 'w', encoding='utf8') as f:
  469. yield f
  470. else:
  471. if compression:
  472. with gzip.GzipFile(fileobj=file, mode='wb', compresslevel=compression) as zf:
  473. yield utf8_writer(zf)
  474. else:
  475. yield utf8_writer(file)
@cython.final
@cython.internal
cdef class _FilelikeWriter:
    """Adapter that lets libxml2 write to a Python file-like object.

    The ``write()`` and ``close()`` methods are called from libxml2 I/O
    callbacks.  They never raise: exceptions are stored in the exception
    context and signalled to libxml2 through a ``-1`` return value.
    """
    # Object that receives the data through its .write() method.
    cdef object _filelike
    # Bound .close() to call on close, or None to leave the object open.
    cdef object _close_filelike
    # Stores exceptions raised inside the callbacks.
    cdef _ExceptionContext _exc_context
    cdef _ErrorLog error_log

    def __cinit__(self, filelike, exc_context=None, compression=None, close=False):
        if compression is not None and compression > 0:
            # Wrap the target in a gzip stream; the wrapper is always closed.
            filelike = GzipFile(
                fileobj=filelike, mode='wb', compresslevel=compression)
            self._close_filelike = filelike.close
        elif close:
            self._close_filelike = filelike.close
        self._filelike = filelike
        if exc_context is None:
            self._exc_context = _ExceptionContext()
        else:
            self._exc_context = exc_context
        self.error_log = _ErrorLog()

    cdef tree.xmlOutputBuffer* _createOutputBuffer(
            self, tree.xmlCharEncodingHandler* enchandler) except NULL:
        # Create a libxml2 output buffer whose I/O callbacks dispatch to this
        # object.  Raises IOError if libxml2 cannot create the buffer.
        cdef tree.xmlOutputBuffer* c_buffer
        c_buffer = tree.xmlOutputBufferCreateIO(
            <tree.xmlOutputWriteCallback>_writeFilelikeWriter, _closeFilelikeWriter,
            <python.PyObject*>self, enchandler)
        if c_buffer is NULL:
            raise IOError, "Could not create I/O writer context."
        return c_buffer

    cdef int write(self, char* c_buffer, int size) noexcept:
        # Returns the number of bytes handed to the file-like object,
        # or -1 if writing failed (the exception is stored, not raised).
        try:
            if self._filelike is None:
                raise IOError, "File is already closed"
            py_buffer = <bytes>c_buffer[:size]
            self._filelike.write(py_buffer)
        except:
            size = -1
            self._exc_context._store_raised()
        finally:
            return size  # and swallow any further exceptions

    cdef int close(self) noexcept:
        # Returns 0 on success, -1 if closing failed (exception is stored).
        retval = 0
        try:
            if self._close_filelike is not None:
                self._close_filelike()
            # When no close callable was registered, the file object is left
            # open (we did not open it); only our reference is dropped.
            self._filelike = None
        except:
            retval = -1
            self._exc_context._store_raised()
        finally:
            return retval  # and swallow any further exceptions
  528. cdef int _writeFilelikeWriter(void* ctxt, char* c_buffer, int length) noexcept:
  529. return (<_FilelikeWriter>ctxt).write(c_buffer, length)
  530. cdef int _closeFilelikeWriter(void* ctxt) noexcept:
  531. return (<_FilelikeWriter>ctxt).close()
cdef _tofilelike(f, _Element element, encoding, doctype, method,
                 bint write_xml_declaration, bint write_doctype,
                 bint pretty_print, bint with_tail, int standalone,
                 int compression):
    """Serialise an element to a file path or file-like object *f*.

    For method "text" the content is built in memory (and gzip compressed
    if *compression* is non-zero) before being written.  Otherwise the node
    is streamed through a libxml2 output buffer created by
    ``_create_output_buffer()``.

    Raises exceptions stored by the file-like writer, or whatever
    ``_raiseSerialisationError()`` raises for a libxml2 error.
    """
    cdef _FilelikeWriter writer = None
    cdef tree.xmlOutputBuffer* c_buffer
    cdef tree.xmlCharEncodingHandler* enchandler
    cdef const_char* c_enc
    cdef const_xmlChar* c_doctype
    cdef int error_result

    c_method = _findOutputMethod(method)
    if c_method == OUTPUT_METHOD_TEXT:
        data = _textToString(element._c_node, encoding, with_tail)
        if compression:
            # Compress in memory, then write the compressed bytes below.
            bytes_out = BytesIO()
            with GzipFile(fileobj=bytes_out, mode='wb', compresslevel=compression) as gzip_file:
                gzip_file.write(data)
            data = bytes_out.getvalue()
        f = _getFSPathOrObject(f)
        if _isString(f):
            filename8 = _encodeFilename(f)
            with open(filename8, 'wb') as f:
                f.write(data)
        else:
            f.write(data)
        return

    if encoding is None:
        c_enc = NULL
    else:
        # Rebinding 'encoding' keeps the converted object referenced while
        # c_enc is in use.
        encoding = _utf8(encoding)
        c_enc = _cstr(encoding)
    if doctype is None:
        c_doctype = NULL
    else:
        doctype = _utf8(doctype)
        c_doctype = _xcstr(doctype)

    writer = _create_output_buffer(f, c_enc, compression, &c_buffer, close=False)
    if writer is None:
        # No Python writer involved (libxml2 writes to the file name
        # itself), so the GIL can be released during serialisation.
        with nogil:
            error_result = _serialise_node(
                c_buffer, c_doctype, c_enc, element._c_node, c_method,
                write_xml_declaration, write_doctype, pretty_print, with_tail, standalone)
    else:
        # The write callbacks run Python code, so the GIL is kept.
        error_result = _serialise_node(
            c_buffer, c_doctype, c_enc, element._c_node, c_method,
            write_xml_declaration, write_doctype, pretty_print, with_tail, standalone)

    # Exceptions from the file-like object take precedence over the
    # libxml2 error code.
    if writer is not None:
        writer._exc_context._raise_if_stored()
    if error_result != xmlerror.XML_ERR_OK:
        _raiseSerialisationError(error_result)
  582. cdef int _serialise_node(tree.xmlOutputBuffer* c_buffer, const_xmlChar* c_doctype,
  583. const_char* c_enc, xmlNode* c_node, int c_method,
  584. bint write_xml_declaration, bint write_doctype, bint pretty_print,
  585. bint with_tail, int standalone) noexcept nogil:
  586. _writeNodeToBuffer(
  587. c_buffer, c_node, c_enc, c_doctype, c_method,
  588. write_xml_declaration, write_doctype, pretty_print, with_tail, standalone)
  589. error_result = c_buffer.error
  590. if error_result == xmlerror.XML_ERR_OK:
  591. error_result = tree.xmlOutputBufferClose(c_buffer)
  592. if error_result != -1:
  593. error_result = xmlerror.XML_ERR_OK
  594. else:
  595. tree.xmlOutputBufferClose(c_buffer)
  596. return error_result
cdef _FilelikeWriter _create_output_buffer(
        f, const_char* c_enc, int c_compression,
        tree.xmlOutputBuffer** c_buffer_ret, bint close):
    """Create a libxml2 output buffer for a file path or file-like object.

    The buffer is stored in ``c_buffer_ret[0]``.  Returns the
    ``_FilelikeWriter`` wrapping *f* when *f* has a ``.write()`` method, or
    ``None`` when libxml2 writes to a file name directly.

    Raises ``LookupError`` for an unknown encoding, ``IOError`` if the file
    cannot be opened and ``TypeError`` if *f* is neither a path nor
    writable.  The encoding handler is released on any failure.
    """
    cdef tree.xmlOutputBuffer* c_buffer
    cdef _FilelikeWriter writer
    cdef bytes filename8
    enchandler = tree.xmlFindCharEncodingHandler(c_enc)
    if enchandler is NULL:
        raise LookupError(
            f"unknown encoding: '{c_enc.decode('UTF-8') if c_enc is not NULL else u''}'")
    try:
        f = _getFSPathOrObject(f)

        if c_compression and not HAS_ZLIB_COMPRESSION and _isString(f):
            # Let "_FilelikeWriter" fall back to Python's GzipFile.
            # We opened this file ourselves, so the writer must close it.
            f = open(f, mode="wb")
            close = True

        if _isString(f):
            filename8 = _encodeFilename(f)
            if b'%' in filename8 and (
                    # Exclude absolute Windows paths and file:// URLs.
                    _isFilePath(<const xmlChar*>filename8) not in (NO_FILE_PATH, ABS_WIN_FILE_PATH)
                    or filename8[:7].lower() == b'file://'):
                # A file path (not a URL) containing the '%' URL escape character.
                # libxml2 uses URL-unescaping on these, so escape the path before passing it in.
                filename8 = filename8.replace(b'%', b'%25')
            c_buffer = tree.xmlOutputBufferCreateFilename(
                _cstr(filename8), enchandler, c_compression)
            if c_buffer is NULL:
                python.PyErr_SetFromErrno(IOError)  # raises IOError
            writer = None
        elif hasattr(f, 'write'):
            writer = _FilelikeWriter(f, compression=c_compression, close=close)
            c_buffer = writer._createOutputBuffer(enchandler)
        else:
            raise TypeError(
                f"File or filename expected, got '{python._fqtypename(f).decode('UTF-8')}'")
    except:
        # No output buffer exists on this path, so release the encoding
        # handler here before re-raising.
        tree.xmlCharEncCloseFunc(enchandler)
        raise

    c_buffer_ret[0] = c_buffer
    return writer
  638. cdef xmlChar **_convert_ns_prefixes(tree.xmlDict* c_dict, ns_prefixes) except NULL:
  639. cdef size_t i, num_ns_prefixes = len(ns_prefixes)
  640. # Need to allocate one extra memory block to handle last NULL entry
  641. c_ns_prefixes = <xmlChar **>python.lxml_malloc(num_ns_prefixes + 1, sizeof(xmlChar*))
  642. if not c_ns_prefixes:
  643. raise MemoryError()
  644. i = 0
  645. try:
  646. for prefix in ns_prefixes:
  647. prefix_utf = _utf8(prefix)
  648. c_prefix_len = len(prefix_utf)
  649. if c_prefix_len > limits.INT_MAX:
  650. raise ValueError("Prefix too long")
  651. c_prefix = tree.xmlDictExists(c_dict, _xcstr(prefix_utf), <int> c_prefix_len)
  652. if c_prefix:
  653. # unknown prefixes do not need to get serialised
  654. c_ns_prefixes[i] = <xmlChar*>c_prefix
  655. i += 1
  656. except:
  657. python.lxml_free(c_ns_prefixes)
  658. raise
  659. c_ns_prefixes[i] = NULL # append end marker
  660. return c_ns_prefixes
cdef _tofilelikeC14N(f, _Element element, bint exclusive, bint with_comments,
                     int compression, inclusive_ns_prefixes):
    """Write the canonical XML (C14N) form of an element to *f*.

    *f* may be a file path or an object with a ``.write()`` method.  The
    element is serialised through a temporary fake root document that is
    destroyed again in every case.

    Raises ``TypeError`` if *f* is neither a path nor writable, exceptions
    stored by the file-like writer, and ``C14NError`` if libxml2 fails.
    """
    cdef _FilelikeWriter writer = None
    cdef tree.xmlOutputBuffer* c_buffer
    cdef xmlChar **c_inclusive_ns_prefixes = NULL
    cdef char* c_filename
    cdef xmlDoc* c_base_doc
    cdef xmlDoc* c_doc
    cdef int bytes_count, error = 0

    c_base_doc = element._c_node.doc
    c_doc = _fakeRootDoc(c_base_doc, element._c_node)
    try:
        # Converted inside the 'try' so that a failure still destroys the
        # fake document below.
        c_inclusive_ns_prefixes = (
            _convert_ns_prefixes(c_doc.dict, inclusive_ns_prefixes)
            if inclusive_ns_prefixes else NULL)

        f = _getFSPathOrObject(f)

        close = False
        if compression and not HAS_ZLIB_COMPRESSION and _isString(f):
            # Let "_FilelikeWriter" fall back to Python's GzipFile.
            # We opened this file ourselves, so the writer must close it.
            f = open(f, mode="wb")
            close = True

        if _isString(f):
            filename8 = _encodeFilename(f)
            c_filename = _cstr(filename8)
            with nogil:
                error = c14n.xmlC14NDocSave(
                    c_doc, NULL, exclusive, c_inclusive_ns_prefixes,
                    with_comments, c_filename, compression)
        elif hasattr(f, 'write'):
            writer = _FilelikeWriter(f, compression=compression, close=close)
            c_buffer = writer._createOutputBuffer(NULL)
            try:
                with writer.error_log:
                    bytes_count = c14n.xmlC14NDocSaveTo(
                        c_doc, NULL, exclusive, c_inclusive_ns_prefixes,
                        with_comments, c_buffer)
            finally:
                error = tree.xmlOutputBufferClose(c_buffer)
            # A serialisation failure takes precedence over the close result;
            # any close result other than -1 counts as success.
            if bytes_count < 0:
                error = bytes_count
            elif error != -1:
                error = xmlerror.XML_ERR_OK
        else:
            raise TypeError(f"File or filename expected, got '{python._fqtypename(f).decode('UTF-8')}'")
    finally:
        _destroyFakeDoc(c_base_doc, c_doc)
        if c_inclusive_ns_prefixes is not NULL:
            python.lxml_free(c_inclusive_ns_prefixes)

    # Exceptions raised by the file-like object are re-raised first.
    if writer is not None:
        writer._exc_context._raise_if_stored()

    if error < 0:
        message = "C14N failed"
        if writer is not None:
            # Prefer the first message collected in the writer's error log.
            errors = writer.error_log
            if len(errors):
                message = errors[0].message
        raise C14NError(message)
  718. # C14N 2.0
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Convert XML to its C14N 2.0 serialised form.

    If *out* is provided, it must be a file or file-like object that receives
    the serialised canonical XML output (text, not bytes) through its ``.write()``
    method.  To write to a file, open it in text mode with encoding "utf-8".
    If *out* is not provided, this function returns the output as text string;
    otherwise it returns None.

    Either *xml_data* (an XML string, tree or Element) or *from_file*
    (a file path or file-like object) must be provided as input.  If both
    are given, *xml_data* is used.  A ValueError is raised if neither is given.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Collect the output in memory only when the caller gave no target.
    sio = None
    if out is None:
        sio = out = StringIO()

    target = C14NWriterTarget(out.write, **options)

    if xml_data is not None and not isinstance(xml_data, basestring):
        # Not a string: walk the given tree/Element instead of parsing.
        _tree_to_target(xml_data, target)
        return sio.getvalue() if sio is not None else None

    cdef _FeedParser parser = XMLParser(
        target=target,
        attribute_defaults=True,
        collect_ids=False,
    )

    if xml_data is not None:
        parser.feed(xml_data)
        parser.close()
    elif from_file is not None:
        try:
            _parseDocument(from_file, parser, base_url=None)
        except _TargetParserResult:
            # NOTE(review): presumably how a target parser hands back its
            # result; the output has already gone to 'out' - confirm.
            pass

    return sio.getvalue() if sio is not None else None
  752. cdef _tree_to_target(element, target):
  753. for event, elem in iterwalk(element, events=('start', 'end', 'start-ns', 'comment', 'pi')):
  754. text = None
  755. if event == 'start':
  756. target.start(elem.tag, elem.attrib)
  757. text = elem.text
  758. elif event == 'end':
  759. target.end(elem.tag)
  760. text = elem.tail
  761. elif event == 'start-ns':
  762. target.start_ns(*elem)
  763. continue
  764. elif event == 'comment':
  765. target.comment(elem.text)
  766. text = elem.tail
  767. elif event == 'pi':
  768. target.pi(elem.target, elem.text)
  769. text = elem.tail
  770. if text:
  771. target.data(text)
  772. return target.close()
# Matcher for strings of the exact form "<word chars>:<word chars>",
# e.g. a prefixed name like "ns:tag".  Returns a match object or None.
cdef object _looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match
cdef class C14NWriterTarget:
    """
    Canonicalization writer target for the XMLParser.

    Serialises parse events to XML C14N 2.0.

    The serialised output is passed piece by piece, as strings, to the
    ``write`` callable given to the constructor.

    Configuration options:

    - *with_comments*: set to true to include comments
    - *strip_text*: set to true to strip whitespace before and after text content
    - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
    - *qname_aware_tags*: a set of qname aware tag names in which prefixes
                          should be replaced in text content
    - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
                           should be replaced in text content
    - *exclude_attrs*: a set of attribute names that should not be serialised
    - *exclude_tags*: a set of tag names that should not be serialised
    """
    # Callable that receives each chunk of serialised output.
    cdef object _write
    # Text chunks collected by data() until the next _flush().
    cdef list _data
    # Tag names whose text content may be a prefixed name (None if unused).
    cdef set _qname_aware_tags
    # Bound set.intersection() of the qname aware attribute names (None if unused).
    cdef object _find_qname_aware_attrs
    # Stack of lists of (uri, prefix) pairs that were declared in the output.
    cdef list _declared_ns_stack
    # Stack of lists of (uri, prefix) pairs that were declared in the input.
    cdef list _ns_stack
    # Maps namespace URIs to generated prefixes when rewriting prefixes.
    cdef dict _prefix_map
    # Stack of booleans: is whitespace preserved at the current level?
    cdef list _preserve_space
    # (tag, attrs, new_namespaces) of a start tag that waits for its text, or None.
    cdef tuple _pending_start
    cdef set _exclude_tags
    cdef set _exclude_attrs
    # Nesting depth inside an excluded tag; 0 when not ignoring content.
    cdef Py_ssize_t _ignored_depth
    cdef bint _with_comments
    cdef bint _strip_text
    cdef bint _rewrite_prefixes
    # Set once the first start tag was written.
    cdef bint _root_seen
    # Set by end() when the element stack is back at its initial depth.
    cdef bint _root_done

    def __init__(self, write, *,
                 with_comments=False, strip_text=False, rewrite_prefixes=False,
                 qname_aware_tags=None, qname_aware_attrs=None,
                 exclude_attrs=None, exclude_tags=None):
        self._write = write
        self._data = []
        self._with_comments = with_comments
        self._strip_text = strip_text
        # Empty or missing exclusion collections are stored as None
        # so that the hot paths can skip them with a simple None test.
        self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
        self._exclude_tags = set(exclude_tags) if exclude_tags else None

        self._rewrite_prefixes = rewrite_prefixes
        if qname_aware_tags:
            self._qname_aware_tags = set(qname_aware_tags)
        else:
            self._qname_aware_tags = None
        if qname_aware_attrs:
            self._find_qname_aware_attrs = set(qname_aware_attrs).intersection
        else:
            self._find_qname_aware_attrs = None

        # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
        self._declared_ns_stack = [[
            ("http://www.w3.org/XML/1998/namespace", "xml"),
        ]]
        # Stack with user declared namespace prefixes as (uri, prefix) pairs.
        self._ns_stack = []
        if not rewrite_prefixes:
            self._ns_stack.append(_DEFAULT_NAMESPACE_PREFIXES_ITEMS)
        self._ns_stack.append([])
        self._prefix_map = {}
        self._preserve_space = [False]
        self._pending_start = None
        self._ignored_depth = 0
        self._root_seen = False
        self._root_done = False

    def _iter_namespaces(self, ns_stack):
        """Yield the (uri, prefix) pairs of 'ns_stack', innermost level first."""
        for namespaces in reversed(ns_stack):
            if namespaces:  # almost no element declares new namespaces
                yield from namespaces

    cdef _resolve_prefix_name(self, prefixed_name):
        """Convert a "prefix:name" string into "{uri}name".

        The prefix is looked up in the namespaces declared in the input.
        Raises ValueError if the prefix is not declared in scope.
        """
        prefix, name = prefixed_name.split(':', 1)
        for uri, p in self._iter_namespaces(self._ns_stack):
            if p == prefix:
                return f'{{{uri}}}{name}'
        raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')

    cdef _qname(self, qname, uri=None):
        """Return the tuple (output qname, local name, uri) for a name.

        'qname' is either a "{uri}name" / plain "name" string (when 'uri'
        is None) or a local name that belongs to the given 'uri'.
        May append a new (uri, prefix) declaration to the innermost level
        of the declared namespace stack.
        Raises ValueError if the namespace is not declared in scope.
        """
        if uri is None:
            uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)
        else:
            tag = qname

        # Prefixes that were seen on an inner level hide the same prefix
        # on outer levels, so only the first occurrence of a prefix counts.
        prefixes_seen = set()
        for u, prefix in self._iter_namespaces(self._declared_ns_stack):
            if u == uri and prefix not in prefixes_seen:
                return f'{prefix}:{tag}' if prefix else tag, tag, uri
            prefixes_seen.add(prefix)

        # Not declared yet => add new declaration.
        if self._rewrite_prefixes:
            if uri in self._prefix_map:
                prefix = self._prefix_map[uri]
            else:
                prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
            self._declared_ns_stack[-1].append((uri, prefix))
            return f'{prefix}:{tag}', tag, uri

        if not uri and '' not in prefixes_seen:
            # No default namespace declared => no prefix needed.
            return tag, tag, uri

        for u, prefix in self._iter_namespaces(self._ns_stack):
            if u == uri:
                self._declared_ns_stack[-1].append((uri, prefix))
                return f'{prefix}:{tag}' if prefix else tag, tag, uri

        if not uri:
            # As soon as a default namespace is defined,
            # anything that has no namespace (and thus, no prefix) goes there.
            return tag, tag, uri

        raise ValueError(f'Namespace "{uri}" of name "{tag}" is not declared in scope')

    def data(self, data):
        """Collect text content, unless inside of an excluded tag."""
        if not self._ignored_depth:
            self._data.append(data)

    cdef _flush(self):
        """Write out the collected text and any pending start tag.

        If a start tag was deferred (qname aware tag), it is written here,
        with the text resolved as a prefixed name if it looks like one.
        """
        cdef unicode data = ''.join(self._data)
        del self._data[:]
        if self._strip_text and not self._preserve_space[-1]:
            data = data.strip()
        if self._pending_start is not None:
            (tag, attrs, new_namespaces), self._pending_start = self._pending_start, None
            qname_text = data if ':' in data and _looks_like_prefix_name(data) else None
            self._start(tag, attrs, new_namespaces, qname_text)
            if qname_text is not None:
                # _start() has already written the resolved text content.
                return
        if data and self._root_seen:
            self._write(_escape_cdata_c14n(data))

    def start_ns(self, prefix, uri):
        """Record a namespace declaration of the input for the next element."""
        if self._ignored_depth:
            return
        # we may have to resolve qnames in text content
        if self._data:
            self._flush()
        self._ns_stack[-1].append((uri, prefix))

    def start(self, tag, attrs):
        """Handle an opening tag.

        Excluded tags (and everything nested in them) only increase the
        ignore depth.  For qname aware tags, writing is deferred until the
        text content is known.
        """
        if self._exclude_tags is not None and (
                self._ignored_depth or tag in self._exclude_tags):
            self._ignored_depth += 1
            return
        if self._data:
            self._flush()

        # The same list object is pushed on the stack and passed on,
        # so that _qname() can add declarations that _start() then writes.
        new_namespaces = []
        self._declared_ns_stack.append(new_namespaces)

        if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
            # Need to parse text first to see if it requires a prefix declaration.
            self._pending_start = (tag, attrs, new_namespaces)
            return
        self._start(tag, attrs, new_namespaces)

    cdef _start(self, tag, attrs, new_namespaces, qname_text=None):
        """Write a start tag with its namespace declarations and attributes.

        'qname_text', if given, is the prefixed-name text content of the
        element; it is resolved and written right after the start tag.
        """
        if self._exclude_attrs is not None and attrs:
            attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}

        qnames = {tag, *attrs}
        resolved_names = {}

        # Resolve prefixes in attribute and tag text.
        if qname_text is not None:
            qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
            qnames.add(qname)
        if self._find_qname_aware_attrs is not None and attrs:
            qattrs = self._find_qname_aware_attrs(attrs)
            if qattrs:
                for attr_name in qattrs:
                    value = attrs[attr_name]
                    if _looks_like_prefix_name(value):
                        qname = resolved_names[value] = self._resolve_prefix_name(value)
                        qnames.add(qname)
            else:
                qattrs = None
        else:
            qattrs = None

        # Assign prefixes in lexicographical order of used URIs.
        parsed_qnames = {n: self._qname(n) for n in sorted(
            qnames, key=lambda n: n.split('}', 1))}

        # Write namespace declarations in prefix order ...
        if new_namespaces:
            attr_list = [
                ('xmlns:' + prefix if prefix else 'xmlns', uri)
                for uri, prefix in new_namespaces
            ]
            attr_list.sort()
        else:
            # almost always empty
            attr_list = []

        # ... followed by attributes in URI+name order
        if attrs:
            for k, v in sorted(attrs.items()):
                if qattrs is not None and k in qattrs and v in resolved_names:
                    v = parsed_qnames[resolved_names[v]][0]
                attr_qname, attr_name, uri = parsed_qnames[k]
                # No prefix for attributes in default ('') namespace.
                attr_list.append((attr_qname if uri else attr_name, v))

        # Honour xml:space attributes.
        space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
        self._preserve_space.append(
            space_behaviour == 'preserve' if space_behaviour
            else self._preserve_space[-1])

        # Write the tag.
        write = self._write
        write('<' + parsed_qnames[tag][0])
        if attr_list:
            write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
        write('>')

        # Write the resolved qname text content.
        if qname_text is not None:
            write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

        self._root_seen = True
        self._ns_stack.append([])

    def end(self, tag):
        """Handle a closing tag: flush text, write the end tag, pop all stacks."""
        if self._ignored_depth:
            self._ignored_depth -= 1
            return
        if self._data:
            self._flush()
        self._write(f'</{self._qname(tag)[0]}>')
        self._preserve_space.pop()
        # Only the initial entry is left once the root element was closed.
        self._root_done = len(self._preserve_space) == 1
        self._declared_ns_stack.pop()
        self._ns_stack.pop()

    def comment(self, text):
        """Write a comment, if comments are enabled and not being ignored.

        A newline separates the comment from the root element when it is
        written before or after the root.
        """
        if not self._with_comments:
            return
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._root_seen and self._data:
            self._flush()
        self._write(f'<!--{_escape_cdata_c14n(text)}-->')
        if not self._root_seen:
            self._write('\n')

    def pi(self, target, data):
        """Write a processing instruction, unless inside of an excluded tag.

        A newline separates the processing instruction from the root
        element when it is written before or after the root.
        """
        if self._ignored_depth:
            return
        if self._root_done:
            self._write('\n')
        elif self._root_seen and self._data:
            self._flush()
        self._write(
            f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>')
        if not self._root_seen:
            self._write('\n')

    def close(self):
        """Finish the target.  Always returns None."""
        return None
  1011. cdef _raise_serialization_error(text):
  1012. raise TypeError("cannot serialize %r (type %s)" % (text, type(text).__name__))
  1013. cdef unicode _escape_cdata_c14n(stext):
  1014. # escape character data
  1015. cdef unicode text
  1016. cdef Py_UCS4 ch
  1017. cdef Py_ssize_t start = 0, pos = 0
  1018. cdef list substrings = None
  1019. try:
  1020. text = unicode(stext)
  1021. except (TypeError, AttributeError):
  1022. return _raise_serialization_error(stext)
  1023. for pos, ch in enumerate(text):
  1024. if ch == '&':
  1025. escape = '&amp;'
  1026. elif ch == '<':
  1027. escape = '&lt;'
  1028. elif ch == '>':
  1029. escape = '&gt;'
  1030. elif ch == '\r':
  1031. escape = '&#xD;'
  1032. else:
  1033. continue
  1034. if substrings is None:
  1035. substrings = []
  1036. if pos > start:
  1037. substrings.append(text[start:pos])
  1038. substrings.append(escape)
  1039. start = pos + 1
  1040. if substrings is None:
  1041. return text
  1042. if pos >= start:
  1043. substrings.append(text[start:pos+1])
  1044. return ''.join(substrings)
  1045. cdef unicode _escape_attrib_c14n(stext):
  1046. # escape attribute value
  1047. cdef unicode text
  1048. cdef Py_UCS4 ch
  1049. cdef Py_ssize_t start = 0, pos = 0
  1050. cdef list substrings = None
  1051. try:
  1052. text = unicode(stext)
  1053. except (TypeError, AttributeError):
  1054. return _raise_serialization_error(stext)
  1055. for pos, ch in enumerate(text):
  1056. if ch == '&':
  1057. escape = '&amp;'
  1058. elif ch == '<':
  1059. escape = '&lt;'
  1060. elif ch == '"':
  1061. escape = '&quot;'
  1062. elif ch == '\t':
  1063. escape = '&#x9;'
  1064. elif ch == '\n':
  1065. escape = '&#xA;'
  1066. elif ch == '\r':
  1067. escape = '&#xD;'
  1068. else:
  1069. continue
  1070. if substrings is None:
  1071. substrings = []
  1072. if pos > start:
  1073. substrings.append(text[start:pos])
  1074. substrings.append(escape)
  1075. start = pos + 1
  1076. if substrings is None:
  1077. return text
  1078. if pos >= start:
  1079. substrings.append(text[start:pos+1])
  1080. return ''.join(substrings)
  1081. # incremental serialisation
  1082. cdef class xmlfile:
  1083. """xmlfile(self, output_file, encoding=None, compression=None, close=False, buffered=True)
  1084. A simple mechanism for incremental XML serialisation.
  1085. Usage example::
  1086. with xmlfile("somefile.xml", encoding='utf-8') as xf:
  1087. xf.write_declaration(standalone=True)
  1088. xf.write_doctype('<!DOCTYPE root SYSTEM "some.dtd">')
  1089. # generate an element (the root element)
  1090. with xf.element('root'):
  1091. # write a complete Element into the open root element
  1092. xf.write(etree.Element('test'))
  1093. # generate and write more Elements, e.g. through iterparse
  1094. for element in generate_some_elements():
  1095. # serialise generated elements into the XML file
  1096. xf.write(element)
  1097. # or write multiple Elements or strings at once
  1098. xf.write(etree.Element('start'), "text", etree.Element('end'))
  1099. If 'output_file' is a file(-like) object, passing ``close=True`` will
  1100. close it when exiting the context manager. By default, it is left
  1101. to the owner to do that. When a file path is used, lxml will take care
  1102. of opening and closing the file itself. Also, when a compression level
  1103. is set, lxml will deliberately close the file to make sure all data gets
  1104. compressed and written.
  1105. Setting ``buffered=False`` will flush the output after each operation,
  1106. such as opening or closing an ``xf.element()`` block or calling
  1107. ``xf.write()``. Alternatively, calling ``xf.flush()`` can be used to
  1108. explicitly flush any pending output when buffering is enabled.
  1109. """
  1110. cdef object output_file
  1111. cdef bytes encoding
  1112. cdef _IncrementalFileWriter writer
  1113. cdef _AsyncIncrementalFileWriter async_writer
  1114. cdef int compresslevel
  1115. cdef bint close
  1116. cdef bint buffered
  1117. cdef int method
  1118. def __init__(self, output_file not None, encoding=None, compression=None,
  1119. close=False, buffered=True):
  1120. self.output_file = output_file
  1121. self.encoding = _utf8orNone(encoding)
  1122. self.compresslevel = compression or 0
  1123. self.close = close
  1124. self.buffered = buffered
  1125. self.method = OUTPUT_METHOD_XML
  1126. def __enter__(self):
  1127. assert self.output_file is not None
  1128. self.writer = _IncrementalFileWriter(
  1129. self.output_file, self.encoding, self.compresslevel,
  1130. self.close, self.buffered, self.method)
  1131. return self.writer
  1132. def __exit__(self, exc_type, exc_val, exc_tb):
  1133. if self.writer is not None:
  1134. old_writer, self.writer = self.writer, None
  1135. raise_on_error = exc_type is None
  1136. old_writer._close(raise_on_error)
  1137. if self.close:
  1138. self.output_file = None
  1139. async def __aenter__(self):
  1140. assert self.output_file is not None
  1141. if isinstance(self.output_file, basestring):
  1142. raise TypeError("Cannot asynchronously write to a plain file")
  1143. if not hasattr(self.output_file, 'write'):
  1144. raise TypeError("Output file needs an async .write() method")
  1145. self.async_writer = _AsyncIncrementalFileWriter(
  1146. self.output_file, self.encoding, self.compresslevel,
  1147. self.close, self.buffered, self.method)
  1148. return self.async_writer
  1149. async def __aexit__(self, exc_type, exc_val, exc_tb):
  1150. if self.async_writer is not None:
  1151. old_writer, self.async_writer = self.async_writer, None
  1152. raise_on_error = exc_type is None
  1153. await old_writer._close(raise_on_error)
  1154. if self.close:
  1155. self.output_file = None
  1156. cdef class htmlfile(xmlfile):
  1157. """htmlfile(self, output_file, encoding=None, compression=None, close=False, buffered=True)
  1158. A simple mechanism for incremental HTML serialisation. Works the same as
  1159. xmlfile.
  1160. """
  1161. def __init__(self, *args, **kwargs):
  1162. super().__init__(*args, **kwargs)
  1163. self.method = OUTPUT_METHOD_HTML
# Progress states of an incremental file writer.
# The numeric order matters: the writer code compares states with
# '<', '>' and '>=' to decide what may still be written.
cdef enum _IncrementalFileWriterStatus:
    WRITER_STARTING = 0      # initial state, nothing written so far
    WRITER_DECL_WRITTEN = 1  # set after writing the XML declaration
    WRITER_DTD_WRITTEN = 2   # set after writing a doctype
    WRITER_IN_ELEMENT = 3    # set after writing a start tag
    WRITER_FINISHED = 4      # set when the last open element or the writer was closed
  1170. @cython.final
  1171. @cython.internal
  1172. cdef class _IncrementalFileWriter:
  1173. cdef tree.xmlOutputBuffer* _c_out
  1174. cdef bytes _encoding
  1175. cdef const_char* _c_encoding
  1176. cdef _FilelikeWriter _target
  1177. cdef list _element_stack
  1178. cdef int _status
  1179. cdef int _method
  1180. cdef bint _buffered
  1181. def __cinit__(self, outfile, bytes encoding, int compresslevel, bint close,
  1182. bint buffered, int method):
  1183. self._status = WRITER_STARTING
  1184. self._element_stack = []
  1185. if encoding is None:
  1186. # We always need a document encoding to make the attribute serialisation
  1187. # of libxml2 identical to ours.
  1188. encoding = b'ASCII'
  1189. self._encoding = encoding
  1190. self._c_encoding = _cstr(encoding)
  1191. self._buffered = buffered
  1192. self._target = _create_output_buffer(
  1193. outfile, self._c_encoding, compresslevel, &self._c_out, close)
  1194. self._method = method
  1195. def __dealloc__(self):
  1196. if self._c_out is not NULL:
  1197. tree.xmlOutputBufferClose(self._c_out)
  1198. def write_declaration(self, version=None, standalone=None, doctype=None):
  1199. """write_declaration(self, version=None, standalone=None, doctype=None)
  1200. Write an XML declaration and (optionally) a doctype into the file.
  1201. """
  1202. assert self._c_out is not NULL
  1203. cdef const_xmlChar* c_version
  1204. cdef int c_standalone
  1205. if self._method != OUTPUT_METHOD_XML:
  1206. raise LxmlSyntaxError("only XML documents have declarations")
  1207. if self._status >= WRITER_DECL_WRITTEN:
  1208. raise LxmlSyntaxError("XML declaration already written")
  1209. version = _utf8orNone(version)
  1210. c_version = _xcstr(version) if version is not None else NULL
  1211. doctype = _utf8orNone(doctype)
  1212. if standalone is None:
  1213. c_standalone = -1
  1214. else:
  1215. c_standalone = 1 if standalone else 0
  1216. _writeDeclarationToBuffer(self._c_out, c_version, self._c_encoding, c_standalone)
  1217. if doctype is not None:
  1218. _writeDoctype(self._c_out, _xcstr(doctype))
  1219. self._status = WRITER_DTD_WRITTEN
  1220. else:
  1221. self._status = WRITER_DECL_WRITTEN
  1222. if not self._buffered:
  1223. tree.xmlOutputBufferFlush(self._c_out)
  1224. self._handle_error(self._c_out.error)
  1225. def write_doctype(self, doctype):
  1226. """write_doctype(self, doctype)
  1227. Writes the given doctype declaration verbatimly into the file.
  1228. """
  1229. assert self._c_out is not NULL
  1230. if doctype is None:
  1231. return
  1232. if self._status >= WRITER_DTD_WRITTEN:
  1233. raise LxmlSyntaxError("DOCTYPE already written or cannot write it here")
  1234. doctype = _utf8(doctype)
  1235. _writeDoctype(self._c_out, _xcstr(doctype))
  1236. self._status = WRITER_DTD_WRITTEN
  1237. if not self._buffered:
  1238. tree.xmlOutputBufferFlush(self._c_out)
  1239. self._handle_error(self._c_out.error)
  1240. def method(self, method):
  1241. """method(self, method)
  1242. Returns a context manager that overrides and restores the output method.
  1243. method is one of (None, 'xml', 'html') where None means 'xml'.
  1244. """
  1245. assert self._c_out is not NULL
  1246. c_method = self._method if method is None else _findOutputMethod(method)
  1247. return _MethodChanger(self, c_method)
  1248. def element(self, tag, attrib=None, nsmap=None, method=None, **_extra):
  1249. """element(self, tag, attrib=None, nsmap=None, method, **_extra)
  1250. Returns a context manager that writes an opening and closing tag.
  1251. method is one of (None, 'xml', 'html') where None means 'xml'.
  1252. """
  1253. assert self._c_out is not NULL
  1254. attributes = []
  1255. if attrib is not None:
  1256. for name, value in _iter_attrib(attrib):
  1257. if name not in _extra:
  1258. ns, name = _getNsTag(name)
  1259. attributes.append((ns, name, _utf8(value)))
  1260. if _extra:
  1261. for name, value in _extra.iteritems():
  1262. ns, name = _getNsTag(name)
  1263. attributes.append((ns, name, _utf8(value)))
  1264. reversed_nsmap = {}
  1265. if nsmap:
  1266. for prefix, ns in nsmap.items():
  1267. if prefix is not None:
  1268. prefix = _utf8(prefix)
  1269. _prefixValidOrRaise(prefix)
  1270. reversed_nsmap[_utf8(ns)] = prefix
  1271. ns, name = _getNsTag(tag)
  1272. c_method = self._method if method is None else _findOutputMethod(method)
  1273. return _FileWriterElement(self, (ns, name, attributes, reversed_nsmap), c_method)
  1274. cdef _write_qname(self, bytes name, bytes prefix):
  1275. if prefix: # empty bytes for no prefix (not None to allow sorting)
  1276. tree.xmlOutputBufferWrite(self._c_out, len(prefix), _cstr(prefix))
  1277. tree.xmlOutputBufferWrite(self._c_out, 1, ':')
  1278. tree.xmlOutputBufferWrite(self._c_out, len(name), _cstr(name))
  1279. cdef _write_start_element(self, element_config):
  1280. if self._status > WRITER_IN_ELEMENT:
  1281. raise LxmlSyntaxError("cannot append trailing element to complete XML document")
  1282. ns, name, attributes, nsmap = element_config
  1283. flat_namespace_map, new_namespaces = self._collect_namespaces(nsmap)
  1284. prefix = self._find_prefix(ns, flat_namespace_map, new_namespaces)
  1285. tree.xmlOutputBufferWrite(self._c_out, 1, '<')
  1286. self._write_qname(name, prefix)
  1287. self._write_attributes_and_namespaces(
  1288. attributes, flat_namespace_map, new_namespaces)
  1289. tree.xmlOutputBufferWrite(self._c_out, 1, '>')
  1290. if not self._buffered:
  1291. tree.xmlOutputBufferFlush(self._c_out)
  1292. self._handle_error(self._c_out.error)
  1293. self._element_stack.append((ns, name, prefix, flat_namespace_map))
  1294. self._status = WRITER_IN_ELEMENT
  1295. cdef _write_attributes_and_namespaces(self, list attributes,
  1296. dict flat_namespace_map,
  1297. list new_namespaces):
  1298. if attributes:
  1299. # _find_prefix() may append to new_namespaces => build them first
  1300. attributes = [
  1301. (self._find_prefix(ns, flat_namespace_map, new_namespaces), name, value)
  1302. for ns, name, value in attributes ]
  1303. if new_namespaces:
  1304. new_namespaces.sort()
  1305. self._write_attributes_list(new_namespaces)
  1306. if attributes:
  1307. self._write_attributes_list(attributes)
  1308. cdef _write_attributes_list(self, list attributes):
  1309. for prefix, name, value in attributes:
  1310. tree.xmlOutputBufferWrite(self._c_out, 1, ' ')
  1311. self._write_qname(name, prefix)
  1312. tree.xmlOutputBufferWrite(self._c_out, 2, '="')
  1313. _write_attr_string(self._c_out, _cstr(value))
  1314. tree.xmlOutputBufferWrite(self._c_out, 1, '"')
  1315. cdef _write_end_element(self, element_config):
  1316. if self._status != WRITER_IN_ELEMENT:
  1317. raise LxmlSyntaxError("not in an element")
  1318. if not self._element_stack or self._element_stack[-1][:2] != element_config[:2]:
  1319. raise LxmlSyntaxError("inconsistent exit action in context manager")
  1320. # If previous write operations failed, the context manager exit might still call us.
  1321. # That is ok, but we stop writing closing tags and handling errors in that case.
  1322. # For all non-I/O errors, we continue writing closing tags if we can.
  1323. ok_to_write = self._c_out.error == xmlerror.XML_ERR_OK
  1324. name, prefix = self._element_stack.pop()[1:3]
  1325. if ok_to_write:
  1326. tree.xmlOutputBufferWrite(self._c_out, 2, '</')
  1327. self._write_qname(name, prefix)
  1328. tree.xmlOutputBufferWrite(self._c_out, 1, '>')
  1329. if not self._element_stack:
  1330. self._status = WRITER_FINISHED
  1331. if ok_to_write:
  1332. if not self._buffered:
  1333. tree.xmlOutputBufferFlush(self._c_out)
  1334. self._handle_error(self._c_out.error)
  1335. cdef _find_prefix(self, bytes href, dict flat_namespaces_map, list new_namespaces):
  1336. if href is None:
  1337. return None
  1338. if href in flat_namespaces_map:
  1339. return flat_namespaces_map[href]
  1340. # need to create a new prefix
  1341. prefixes = flat_namespaces_map.values()
  1342. i = 0
  1343. while True:
  1344. prefix = _utf8('ns%d' % i)
  1345. if prefix not in prefixes:
  1346. new_namespaces.append((b'xmlns', prefix, href))
  1347. flat_namespaces_map[href] = prefix
  1348. return prefix
  1349. i += 1
  1350. cdef _collect_namespaces(self, dict nsmap):
  1351. new_namespaces = []
  1352. flat_namespaces_map = {}
  1353. for ns, prefix in nsmap.iteritems():
  1354. flat_namespaces_map[ns] = prefix
  1355. if prefix is None:
  1356. # use empty bytes rather than None to allow sorting
  1357. new_namespaces.append((b'', b'xmlns', ns))
  1358. else:
  1359. new_namespaces.append((b'xmlns', prefix, ns))
  1360. # merge in flat namespace map of parent
  1361. if self._element_stack:
  1362. for ns, prefix in (<dict>self._element_stack[-1][-1]).iteritems():
  1363. if flat_namespaces_map.get(ns) is None:
  1364. # unknown or empty prefix => prefer a 'real' prefix
  1365. flat_namespaces_map[ns] = prefix
  1366. return flat_namespaces_map, new_namespaces
  1367. def write(self, *args, bint with_tail=True, bint pretty_print=False, method=None):
  1368. """write(self, *args, with_tail=True, pretty_print=False, method=None)
  1369. Write subtrees or strings into the file.
  1370. If method is not None, it should be one of ('html', 'xml', 'text')
  1371. to temporarily override the output method.
  1372. """
  1373. assert self._c_out is not NULL
  1374. c_method = self._method if method is None else _findOutputMethod(method)
  1375. for content in args:
  1376. if _isString(content):
  1377. if self._status != WRITER_IN_ELEMENT:
  1378. if self._status > WRITER_IN_ELEMENT or content.strip():
  1379. raise LxmlSyntaxError("not in an element")
  1380. bstring = _utf8(content)
  1381. if not bstring:
  1382. continue
  1383. ns, name, _, _ = self._element_stack[-1]
  1384. if (c_method == OUTPUT_METHOD_HTML and
  1385. ns in (None, b'http://www.w3.org/1999/xhtml') and
  1386. name in (b'script', b'style')):
  1387. tree.xmlOutputBufferWrite(self._c_out, len(bstring), _cstr(bstring))
  1388. else:
  1389. tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(bstring), NULL)
  1390. elif isinstance(content, CDATA):
  1391. if self._status > WRITER_IN_ELEMENT:
  1392. raise LxmlSyntaxError("not in an element")
  1393. _write_cdata_string(self._c_out, (<CDATA>content)._utf8_data)
  1394. elif iselement(content):
  1395. if self._status > WRITER_IN_ELEMENT:
  1396. raise LxmlSyntaxError("cannot append trailing element to complete XML document")
  1397. _writeNodeToBuffer(self._c_out, (<_Element>content)._c_node,
  1398. self._c_encoding, NULL, c_method,
  1399. False, False, pretty_print, with_tail, False)
  1400. if (<_Element>content)._c_node.type == tree.XML_ELEMENT_NODE:
  1401. if not self._element_stack:
  1402. self._status = WRITER_FINISHED
  1403. elif content is not None:
  1404. raise TypeError(
  1405. f"got invalid input value of type {type(content)}, expected string, CDATA or Element")
  1406. self._handle_error(self._c_out.error)
  1407. if not self._buffered:
  1408. tree.xmlOutputBufferFlush(self._c_out)
  1409. self._handle_error(self._c_out.error)
  1410. def flush(self):
  1411. """flush(self)
  1412. Write any pending content of the current output buffer to the stream.
  1413. """
  1414. assert self._c_out is not NULL
  1415. tree.xmlOutputBufferFlush(self._c_out)
  1416. self._handle_error(self._c_out.error)
  1417. cdef _close(self, bint raise_on_error):
  1418. if raise_on_error:
  1419. if self._status < WRITER_IN_ELEMENT:
  1420. raise LxmlSyntaxError("no content written")
  1421. if self._element_stack:
  1422. raise LxmlSyntaxError("pending open tags on close")
  1423. error_result = self._c_out.error
  1424. if error_result == xmlerror.XML_ERR_OK:
  1425. error_result = tree.xmlOutputBufferClose(self._c_out)
  1426. if error_result != -1:
  1427. error_result = xmlerror.XML_ERR_OK
  1428. else:
  1429. tree.xmlOutputBufferClose(self._c_out)
  1430. self._status = WRITER_FINISHED
  1431. self._c_out = NULL
  1432. del self._element_stack[:]
  1433. if raise_on_error:
  1434. self._handle_error(error_result)
  1435. cdef _handle_error(self, int error_result):
  1436. if error_result != xmlerror.XML_ERR_OK:
  1437. if self._target is not None:
  1438. self._target._exc_context._raise_if_stored()
  1439. _raiseSerialisationError(error_result)
  1440. @cython.final
  1441. @cython.internal
  1442. cdef class _AsyncDataWriter:
  1443. cdef list _data
  1444. def __cinit__(self):
  1445. self._data = []
  1446. cdef bytes collect(self):
  1447. data = b''.join(self._data)
  1448. del self._data[:]
  1449. return data
  1450. def write(self, data):
  1451. self._data.append(data)
  1452. def close(self):
  1453. pass
  1454. @cython.final
  1455. @cython.internal
  1456. cdef class _AsyncIncrementalFileWriter:
  1457. cdef _IncrementalFileWriter _writer
  1458. cdef _AsyncDataWriter _buffer
  1459. cdef object _async_outfile
  1460. cdef int _flush_after_writes
  1461. cdef bint _should_close
  1462. cdef bint _buffered
  1463. def __cinit__(self, async_outfile, bytes encoding, int compresslevel, bint close,
  1464. bint buffered, int method):
  1465. self._flush_after_writes = 20
  1466. self._async_outfile = async_outfile
  1467. self._should_close = close
  1468. self._buffered = buffered
  1469. self._buffer = _AsyncDataWriter()
  1470. self._writer = _IncrementalFileWriter(
  1471. self._buffer, encoding, compresslevel, close=True, buffered=False, method=method)
  1472. cdef bytes _flush(self):
  1473. if not self._buffered or len(self._buffer._data) > self._flush_after_writes:
  1474. return self._buffer.collect()
  1475. return None
  1476. async def flush(self):
  1477. self._writer.flush()
  1478. data = self._buffer.collect()
  1479. if data:
  1480. await self._async_outfile.write(data)
  1481. async def write_declaration(self, version=None, standalone=None, doctype=None):
  1482. self._writer.write_declaration(version, standalone, doctype)
  1483. data = self._flush()
  1484. if data:
  1485. await self._async_outfile.write(data)
  1486. async def write_doctype(self, doctype):
  1487. self._writer.write_doctype(doctype)
  1488. data = self._flush()
  1489. if data:
  1490. await self._async_outfile.write(data)
  1491. async def write(self, *args, with_tail=True, pretty_print=False, method=None):
  1492. self._writer.write(*args, with_tail=with_tail, pretty_print=pretty_print, method=method)
  1493. data = self._flush()
  1494. if data:
  1495. await self._async_outfile.write(data)
  1496. def method(self, method):
  1497. return self._writer.method(method)
  1498. def element(self, tag, attrib=None, nsmap=None, method=None, **_extra):
  1499. element_writer = self._writer.element(tag, attrib, nsmap, method, **_extra)
  1500. return _AsyncFileWriterElement(element_writer, self)
  1501. async def _close(self, bint raise_on_error):
  1502. self._writer._close(raise_on_error)
  1503. data = self._buffer.collect()
  1504. if data:
  1505. await self._async_outfile.write(data)
  1506. if self._should_close:
  1507. await self._async_outfile.close()
  1508. @cython.final
  1509. @cython.internal
  1510. cdef class _AsyncFileWriterElement:
  1511. cdef _FileWriterElement _element_writer
  1512. cdef _AsyncIncrementalFileWriter _writer
  1513. def __cinit__(self, _FileWriterElement element_writer not None,
  1514. _AsyncIncrementalFileWriter writer not None):
  1515. self._element_writer = element_writer
  1516. self._writer = writer
  1517. async def __aenter__(self):
  1518. self._element_writer.__enter__()
  1519. data = self._writer._flush()
  1520. if data:
  1521. await self._writer._async_outfile.write(data)
  1522. async def __aexit__(self, *args):
  1523. self._element_writer.__exit__(*args)
  1524. data = self._writer._flush()
  1525. if data:
  1526. await self._writer._async_outfile.write(data)
  1527. @cython.final
  1528. @cython.internal
  1529. @cython.freelist(8)
  1530. cdef class _FileWriterElement:
  1531. cdef _IncrementalFileWriter _writer
  1532. cdef object _element
  1533. cdef int _new_method
  1534. cdef int _old_method
  1535. def __cinit__(self, _IncrementalFileWriter writer not None, element_config, int method):
  1536. self._writer = writer
  1537. self._element = element_config
  1538. self._new_method = method
  1539. self._old_method = writer._method
  1540. def __enter__(self):
  1541. self._writer._method = self._new_method
  1542. self._writer._write_start_element(self._element)
  1543. def __exit__(self, exc_type, exc_val, exc_tb):
  1544. self._writer._write_end_element(self._element)
  1545. self._writer._method = self._old_method
  1546. @cython.final
  1547. @cython.internal
  1548. @cython.freelist(8)
  1549. cdef class _MethodChanger:
  1550. cdef _IncrementalFileWriter _writer
  1551. cdef int _new_method
  1552. cdef int _old_method
  1553. cdef bint _entered
  1554. cdef bint _exited
  1555. def __cinit__(self, _IncrementalFileWriter writer not None, int method):
  1556. self._writer = writer
  1557. self._new_method = method
  1558. self._old_method = writer._method
  1559. self._entered = False
  1560. self._exited = False
  1561. def __enter__(self):
  1562. if self._entered:
  1563. raise LxmlSyntaxError("Inconsistent enter action in context manager")
  1564. self._writer._method = self._new_method
  1565. self._entered = True
  1566. def __exit__(self, exc_type, exc_val, exc_tb):
  1567. if self._exited:
  1568. raise LxmlSyntaxError("Inconsistent exit action in context manager")
  1569. if self._writer._method != self._new_method:
  1570. raise LxmlSyntaxError("Method changed outside of context manager")
  1571. self._writer._method = self._old_method
  1572. self._exited = True
  1573. async def __aenter__(self):
  1574. # for your async convenience
  1575. return self.__enter__()
  1576. async def __aexit__(self, *args):
  1577. # for your async convenience
  1578. return self.__exit__(*args)