You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

876 lines
33 KiB

  1. # SAX-like interfaces
  2. class XMLSyntaxAssertionError(XMLSyntaxError, AssertionError):
  3. """
  4. An XMLSyntaxError that additionally inherits from AssertionError for
  5. ElementTree / backwards compatibility reasons.
  6. This class may get replaced by a plain XMLSyntaxError in a future version.
  7. """
  8. def __init__(self, message):
  9. XMLSyntaxError.__init__(self, message, None, 0, 1)
  10. ctypedef enum _SaxParserEvents:
  11. SAX_EVENT_START = 1 << 0
  12. SAX_EVENT_END = 1 << 1
  13. SAX_EVENT_DATA = 1 << 2
  14. SAX_EVENT_DOCTYPE = 1 << 3
  15. SAX_EVENT_PI = 1 << 4
  16. SAX_EVENT_COMMENT = 1 << 5
  17. SAX_EVENT_START_NS = 1 << 6
  18. SAX_EVENT_END_NS = 1 << 7
  19. ctypedef enum _ParseEventFilter:
  20. PARSE_EVENT_FILTER_START = 1 << 0
  21. PARSE_EVENT_FILTER_END = 1 << 1
  22. PARSE_EVENT_FILTER_START_NS = 1 << 2
  23. PARSE_EVENT_FILTER_END_NS = 1 << 3
  24. PARSE_EVENT_FILTER_COMMENT = 1 << 4
  25. PARSE_EVENT_FILTER_PI = 1 << 5
  26. cdef int _buildParseEventFilter(events) except -1:
  27. cdef int event_filter = 0
  28. for event in events:
  29. if event == 'start':
  30. event_filter |= PARSE_EVENT_FILTER_START
  31. elif event == 'end':
  32. event_filter |= PARSE_EVENT_FILTER_END
  33. elif event == 'start-ns':
  34. event_filter |= PARSE_EVENT_FILTER_START_NS
  35. elif event == 'end-ns':
  36. event_filter |= PARSE_EVENT_FILTER_END_NS
  37. elif event == 'comment':
  38. event_filter |= PARSE_EVENT_FILTER_COMMENT
  39. elif event == 'pi':
  40. event_filter |= PARSE_EVENT_FILTER_PI
  41. else:
  42. raise ValueError, f"invalid event name '{event}'"
  43. return event_filter
  44. cdef class _SaxParserTarget:
  45. cdef int _sax_event_filter
  46. cdef _handleSaxStart(self, tag, attrib, nsmap):
  47. return None
  48. cdef _handleSaxEnd(self, tag):
  49. return None
  50. cdef int _handleSaxData(self, data) except -1:
  51. return 0
  52. cdef int _handleSaxDoctype(self, root_tag, public_id, system_id) except -1:
  53. return 0
  54. cdef _handleSaxPi(self, target, data):
  55. return None
  56. cdef _handleSaxComment(self, comment):
  57. return None
  58. cdef _handleSaxStartNs(self, prefix, uri):
  59. return None
  60. cdef _handleSaxEndNs(self, prefix):
  61. return None
#@cython.final
@cython.internal
@cython.no_gc_clear  # Required because parent class uses it - Cython bug.
cdef class _SaxParserContext(_ParserContext):
    """This class maps SAX2 events to parser target events.

    It operates in one of two modes, selected in ``_initParserContext()``:
    with a parser target set, the libxml2 SAX callbacks are replaced by the
    ``_handleSaxTarget*`` functions (``_connectTarget()``); otherwise, with
    a non-zero event filter, the original callbacks are wrapped in order to
    collect parse events (``_connectEvents()``).
    """
    # parser target that receives the SAX events (None if not set)
    cdef _SaxParserTarget _target
    # parser passed to __cinit__(); reset to None in startDocument()
    cdef _BaseParser _parser
    # SAX callbacks remembered by _connectTarget() / _connectEvents()
    cdef xmlparser.startElementNsSAX2Func _origSaxStart
    cdef xmlparser.endElementNsSAX2Func _origSaxEnd
    cdef xmlparser.startElementSAXFunc _origSaxStartNoNs
    cdef xmlparser.endElementSAXFunc _origSaxEndNoNs
    cdef xmlparser.charactersSAXFunc _origSaxData
    cdef xmlparser.cdataBlockSAXFunc _origSaxCData
    cdef xmlparser.internalSubsetSAXFunc _origSaxDoctype
    cdef xmlparser.commentSAXFunc _origSaxComment
    cdef xmlparser.processingInstructionSAXFunc _origSaxPI
    cdef xmlparser.startDocumentSAXFunc _origSaxStartDocument

    # for event collecting
    cdef int _event_filter           # mask of PARSE_EVENT_FILTER_* flags
    cdef list _ns_stack              # per-element namespace declarations (or None)
    cdef list _node_stack            # nodes awaiting their 'end' event
    cdef _ParseEventsIterator events_iterator

    # for iterparse
    cdef _Element _root
    cdef _MultiTagMatcher _matcher

    def __cinit__(self, _BaseParser parser):
        # Start with empty stacks and a fresh events iterator.
        self._ns_stack = []
        self._node_stack = []
        self._parser = parser
        self.events_iterator = _ParseEventsIterator()

    cdef void _setSaxParserTarget(self, _SaxParserTarget target) noexcept:
        # Store the parser target that _initParserContext() will connect.
        self._target = target

    cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        # Run the base class setup, then hook up the SAX callbacks: a target
        # takes precedence over plain event collection.
        _ParserContext._initParserContext(self, c_ctxt)
        if self._target is not None:
            self._connectTarget(c_ctxt)
        elif self._event_filter:
            self._connectEvents(c_ctxt)

    cdef void _connectTarget(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        """Wrap original SAX2 callbacks to call into parser target.

        Each callback is first cleared to NULL (the remembered "original"
        becomes NULL as well) and then set to the target handler only if
        the target's event filter requests that event.
        """
        sax = c_ctxt.sax
        self._origSaxStart = sax.startElementNs = NULL
        self._origSaxStartNoNs = sax.startElement = NULL
        if self._target._sax_event_filter & (SAX_EVENT_START |
                                             SAX_EVENT_START_NS |
                                             SAX_EVENT_END_NS):
            # intercept => overwrite orig callback
            # FIXME: also intercept on when collecting END events
            if sax.initialized == xmlparser.XML_SAX2_MAGIC:
                sax.startElementNs = _handleSaxTargetStart
            if self._target._sax_event_filter & SAX_EVENT_START:
                sax.startElement = _handleSaxTargetStartNoNs

        self._origSaxEnd = sax.endElementNs = NULL
        self._origSaxEndNoNs = sax.endElement = NULL
        if self._target._sax_event_filter & (SAX_EVENT_END |
                                             SAX_EVENT_END_NS):
            if sax.initialized == xmlparser.XML_SAX2_MAGIC:
                sax.endElementNs = _handleSaxEnd
            if self._target._sax_event_filter & SAX_EVENT_END:
                sax.endElement = _handleSaxEndNoNs

        # character data and CDATA blocks share one handler
        self._origSaxData = sax.characters = sax.cdataBlock = NULL
        if self._target._sax_event_filter & SAX_EVENT_DATA:
            sax.characters = sax.cdataBlock = _handleSaxData

        # doctype propagation is always required for entity replacement
        self._origSaxDoctype = sax.internalSubset
        if self._target._sax_event_filter & SAX_EVENT_DOCTYPE:
            sax.internalSubset = _handleSaxTargetDoctype

        self._origSaxPI = sax.processingInstruction = NULL
        if self._target._sax_event_filter & SAX_EVENT_PI:
            sax.processingInstruction = _handleSaxTargetPI

        self._origSaxComment = sax.comment = NULL
        if self._target._sax_event_filter & SAX_EVENT_COMMENT:
            sax.comment = _handleSaxTargetComment

        # enforce entity replacement
        sax.reference = NULL
        c_ctxt.replaceEntities = 1

    cdef void _connectEvents(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        """Wrap original SAX2 callbacks to collect parse events without parser target.

        The original callbacks are always remembered; a wrapper is only
        installed where the event filter (or HTML parsing, for the start
        handlers) requires it.
        """
        sax = c_ctxt.sax
        self._origSaxStartDocument = sax.startDocument
        sax.startDocument = _handleSaxStartDocument

        # only override "start" event handler if needed
        self._origSaxStart = sax.startElementNs
        if self._event_filter == 0 or c_ctxt.html or \
               self._event_filter & (PARSE_EVENT_FILTER_START |
                                     PARSE_EVENT_FILTER_END |
                                     PARSE_EVENT_FILTER_START_NS |
                                     PARSE_EVENT_FILTER_END_NS):
            sax.startElementNs = <xmlparser.startElementNsSAX2Func>_handleSaxStart

        self._origSaxStartNoNs = sax.startElement
        if self._event_filter == 0 or c_ctxt.html or \
               self._event_filter & (PARSE_EVENT_FILTER_START |
                                     PARSE_EVENT_FILTER_END):
            sax.startElement = <xmlparser.startElementSAXFunc>_handleSaxStartNoNs

        # only override "end" event handler if needed
        self._origSaxEnd = sax.endElementNs
        if self._event_filter == 0 or \
               self._event_filter & (PARSE_EVENT_FILTER_END |
                                     PARSE_EVENT_FILTER_END_NS):
            sax.endElementNs = <xmlparser.endElementNsSAX2Func>_handleSaxEnd

        self._origSaxEndNoNs = sax.endElement
        if self._event_filter == 0 or \
               self._event_filter & PARSE_EVENT_FILTER_END:
            sax.endElement = <xmlparser.endElementSAXFunc>_handleSaxEndNoNs

        self._origSaxComment = sax.comment
        if self._event_filter & PARSE_EVENT_FILTER_COMMENT:
            sax.comment = <xmlparser.commentSAXFunc>_handleSaxComment

        self._origSaxPI = sax.processingInstruction
        if self._event_filter & PARSE_EVENT_FILTER_PI:
            sax.processingInstruction = <xmlparser.processingInstructionSAXFunc>_handleSaxPIEvent

    cdef _setEventFilter(self, events, tag):
        # Build the event mask from the event names and set up a tag matcher,
        # unless there is nothing to filter (no events, no tag, or tag '*').
        self._event_filter = _buildParseEventFilter(events)
        if not self._event_filter or tag is None or tag == '*':
            self._matcher = None
        else:
            self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)

    cdef int startDocument(self, xmlDoc* c_doc) except -1:
        # Create the document wrapper for c_doc; the parser reference is
        # dropped even if the factory raises.
        try:
            self._doc = _documentFactory(c_doc, self._parser)
        finally:
            self._parser = None  # clear circular reference ASAP
        if self._matcher is not None:
            self._matcher.cacheTags(self._doc, force_into_dict=True)
        return 0

    cdef int pushEvent(self, event, xmlNode* c_node) except -1:
        # Append an (event, node) tuple for c_node to the collected events.
        cdef _Element root
        if self._root is None:
            # remember the document root once it exists as an element node
            root = self._doc.getroot()
            if root is not None and root._c_node.type == tree.XML_ELEMENT_NODE:
                self._root = root
        node = _elementFactory(self._doc, c_node)
        self.events_iterator._events.append( (event, node) )
        return 0

    cdef int flushEvents(self) except -1:
        # Emit 'end' events for all nodes still on the node stack, each
        # followed by its namespace end events, then drain the namespace
        # stack.
        events = self.events_iterator._events
        while self._node_stack:
            events.append( ('end', self._node_stack.pop()) )
            _pushSaxNsEndEvents(self)
        while self._ns_stack:
            _pushSaxNsEndEvents(self)

    cdef void _handleSaxException(self, xmlparser.xmlParserCtxt* c_ctxt) noexcept:
        # Called from the SAX callbacks' ``except:`` clauses: flag an error
        # on the parser context (keeping an already recorded error code),
        # stop the parser and store the raised exception.
        if c_ctxt.errNo == xmlerror.XML_ERR_OK:
            c_ctxt.errNo = xmlerror.XML_ERR_INTERNAL_ERROR
        # stop parsing immediately
        c_ctxt.wellFormed = 0
        c_ctxt.disableSAX = 1
        c_ctxt.instate = xmlparser.XML_PARSER_EOF
        self._store_raised()
  213. @cython.final
  214. @cython.internal
  215. cdef class _ParseEventsIterator:
  216. """A reusable parse events iterator"""
  217. cdef list _events
  218. cdef int _event_index
  219. def __cinit__(self):
  220. self._events = []
  221. self._event_index = 0
  222. def __iter__(self):
  223. return self
  224. def __next__(self):
  225. cdef int event_index = self._event_index
  226. events = self._events
  227. if event_index >= 2**10 or event_index * 2 >= len(events):
  228. if event_index:
  229. # clean up from time to time
  230. del events[:event_index]
  231. self._event_index = event_index = 0
  232. if event_index >= len(events):
  233. raise StopIteration
  234. item = events[event_index]
  235. self._event_index = event_index + 1
  236. return item
  237. cdef list _build_prefix_uri_list(_SaxParserContext context, int c_nb_namespaces,
  238. const_xmlChar** c_namespaces):
  239. "Build [(prefix, uri)] list of declared namespaces."
  240. cdef int i
  241. namespaces = []
  242. for i in xrange(c_nb_namespaces):
  243. namespaces.append((funicodeOrEmpty(c_namespaces[0]), funicode(c_namespaces[1])))
  244. c_namespaces += 2
  245. return namespaces
# SAX2 start-element callback used when collecting parse events without a
# parser target.  It records "start-ns" events, runs the original libxml2
# callback, and then records the namespace stack entry and the start event
# as requested by the context's event filter.
cdef void _handleSaxStart(
        void* ctxt, const_xmlChar* c_localname, const_xmlChar* c_prefix,
        const_xmlChar* c_namespace, int c_nb_namespaces,
        const_xmlChar** c_namespaces,
        int c_nb_attributes, int c_nb_defaulted,
        const_xmlChar** c_attributes) noexcept with gil:
    # NOTE(review): 'i' and 'c_len' are declared but not used in this function.
    cdef int i
    cdef size_t c_len
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    # nothing to do without a Python context or after the parser was stopped
    if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
        return
    context = <_SaxParserContext>c_ctxt._private
    cdef int event_filter = context._event_filter
    try:
        # only build the namespace list if namespace events were requested
        if (c_nb_namespaces and
                event_filter & (PARSE_EVENT_FILTER_START_NS |
                                PARSE_EVENT_FILTER_END_NS)):
            declared_namespaces = _build_prefix_uri_list(
                context, c_nb_namespaces, c_namespaces)
            if event_filter & PARSE_EVENT_FILTER_START_NS:
                for prefix_uri_tuple in declared_namespaces:
                    context.events_iterator._events.append(("start-ns", prefix_uri_tuple))
        else:
            declared_namespaces = None

        # call the callback that was installed before _connectEvents() ran
        context._origSaxStart(c_ctxt, c_localname, c_prefix, c_namespace,
                              c_nb_namespaces, c_namespaces, c_nb_attributes,
                              c_nb_defaulted, c_attributes)
        if c_ctxt.html:
            _fixHtmlDictNodeNames(c_ctxt.dict, c_ctxt.node)
            # The HTML parser in libxml2 reports the missing opening tags when it finds
            # misplaced ones, but with tag names from C string constants that ignore the
            # parser dict.  Thus, we need to intern the name ourselves.
            c_localname = tree.xmlDictLookup(c_ctxt.dict, c_localname, -1)
            if c_localname is NULL:
                raise MemoryError()

        # one stack entry per element (possibly None), popped by
        # _pushSaxNsEndEvents()
        if event_filter & PARSE_EVENT_FILTER_END_NS:
            context._ns_stack.append(declared_namespaces)
        if event_filter & (PARSE_EVENT_FILTER_END |
                           PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, c_namespace, c_localname, None)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
# SAX2 start-element callback used when parsing with a parser target.
# It reports namespace declarations and the element start (tag, attributes,
# nsmap) to the target and records the parse events selected by the
# context's event filter.
cdef void _handleSaxTargetStart(
        void* ctxt, const_xmlChar* c_localname, const_xmlChar* c_prefix,
        const_xmlChar* c_namespace, int c_nb_namespaces,
        const_xmlChar** c_namespaces,
        int c_nb_attributes, int c_nb_defaulted,
        const_xmlChar** c_attributes) noexcept with gil:
    cdef int i
    cdef size_t c_len
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    # nothing to do without a Python context or after the parser was stopped
    if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
        return
    context = <_SaxParserContext>c_ctxt._private
    cdef int event_filter = context._event_filter
    cdef int sax_event_filter = context._target._sax_event_filter
    try:
        if c_nb_namespaces:
            declared_namespaces = _build_prefix_uri_list(
                context, c_nb_namespaces, c_namespaces)
            if event_filter & PARSE_EVENT_FILTER_START_NS:
                for prefix_uri_tuple in declared_namespaces:
                    context.events_iterator._events.append(("start-ns", prefix_uri_tuple))
            if sax_event_filter & SAX_EVENT_START_NS:
                for prefix, uri in declared_namespaces:
                    context._target._handleSaxStartNs(prefix, uri)
        else:
            declared_namespaces = None

        if sax_event_filter & SAX_EVENT_START:
            if c_nb_defaulted > 0:
                # only add default attributes if we asked for them
                if c_ctxt.loadsubset & xmlparser.XML_COMPLETE_ATTRS == 0:
                    c_nb_attributes -= c_nb_defaulted
            if c_nb_attributes == 0:
                attrib = IMMUTABLE_EMPTY_MAPPING
            else:
                attrib = {}
                # Each attribute takes five consecutive array slots.  As used
                # below: [0] is the local name, [2] the namespace, and
                # [3]/[4] delimit the start/end of the value.
                for i in xrange(c_nb_attributes):
                    name = _namespacedNameFromNsName(
                        c_attributes[2], c_attributes[0])
                    if c_attributes[3] is NULL:
                        value = ''
                    else:
                        c_len = c_attributes[4] - c_attributes[3]
                        value = c_attributes[3][:c_len].decode('utf8')
                    attrib[name] = value
                    c_attributes += 5

            nsmap = dict(declared_namespaces) if c_nb_namespaces else IMMUTABLE_EMPTY_MAPPING

            element = _callTargetSaxStart(
                context, c_ctxt,
                _namespacedNameFromNsName(c_namespace, c_localname),
                attrib, nsmap)
        else:
            element = None

        # one stack entry per element (possibly None), popped by
        # _pushSaxNsEndEvents()
        if (event_filter & PARSE_EVENT_FILTER_END_NS or
                sax_event_filter & SAX_EVENT_END_NS):
            context._ns_stack.append(declared_namespaces)
        if event_filter & (PARSE_EVENT_FILTER_END |
                           PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, c_namespace,
                               c_localname, element)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
# Non-namespace (SAX1 style) start-element callback used when collecting
# parse events without a parser target.  It runs the original callback
# first and then records the start event if the event filter asks for it.
cdef void _handleSaxStartNoNs(void* ctxt, const_xmlChar* c_name,
                              const_xmlChar** c_attributes) noexcept with gil:
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    # nothing to do without a Python context or after the parser was stopped
    if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        context._origSaxStartNoNs(c_ctxt, c_name, c_attributes)
        if c_ctxt.html:
            _fixHtmlDictNodeNames(c_ctxt.dict, c_ctxt.node)
            # The HTML parser in libxml2 reports the missing opening tags when it finds
            # misplaced ones, but with tag names from C string constants that ignore the
            # parser dict.  Thus, we need to intern the name ourselves.
            c_name = tree.xmlDictLookup(c_ctxt.dict, c_name, -1)
            if c_name is NULL:
                raise MemoryError()
        if context._event_filter & (PARSE_EVENT_FILTER_END |
                                    PARSE_EVENT_FILTER_START):
            # no namespace (NULL href) and no pre-built node
            _pushSaxStartEvent(context, c_ctxt, NULL, c_name, None)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
# Non-namespace (SAX1 style) start-element callback used when parsing with
# a parser target.  It builds the attribute mapping, passes the element
# start to the target and records the start event if the event filter asks
# for it.
cdef void _handleSaxTargetStartNoNs(void* ctxt, const_xmlChar* c_name,
                                    const_xmlChar** c_attributes) noexcept with gil:
    c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
    # nothing to do without a Python context or after the parser was stopped
    if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
        return
    context = <_SaxParserContext>c_ctxt._private
    try:
        if c_attributes is NULL:
            attrib = IMMUTABLE_EMPTY_MAPPING
        else:
            attrib = {}
            # name/value pairs; the loop stops at the first NULL name
            while c_attributes[0] is not NULL:
                name = funicode(c_attributes[0])
                attrib[name] = funicodeOrEmpty(c_attributes[1])
                c_attributes += 2
        element = _callTargetSaxStart(
            context, c_ctxt, funicode(c_name),
            attrib, IMMUTABLE_EMPTY_MAPPING)
        if context._event_filter & (PARSE_EVENT_FILTER_END |
                                    PARSE_EVENT_FILTER_START):
            _pushSaxStartEvent(context, c_ctxt, NULL, c_name, element)
    except:
        context._handleSaxException(c_ctxt)
    finally:
        return  # swallow any further exceptions
  401. cdef _callTargetSaxStart(_SaxParserContext context,
  402. xmlparser.xmlParserCtxt* c_ctxt,
  403. tag, attrib, nsmap):
  404. element = context._target._handleSaxStart(tag, attrib, nsmap)
  405. if element is not None and c_ctxt.input is not NULL:
  406. if isinstance(element, _Element):
  407. (<_Element>element)._c_node.line = (
  408. <unsigned short>c_ctxt.input.line
  409. if c_ctxt.input.line < 65535 else 65535)
  410. return element
  411. cdef int _pushSaxStartEvent(_SaxParserContext context,
  412. xmlparser.xmlParserCtxt* c_ctxt,
  413. const_xmlChar* c_href,
  414. const_xmlChar* c_name, node) except -1:
  415. if (context._matcher is None or
  416. context._matcher.matchesNsTag(c_href, c_name)):
  417. if node is None and context._target is None:
  418. assert context._doc is not None
  419. node = _elementFactory(context._doc, c_ctxt.node)
  420. if context._event_filter & PARSE_EVENT_FILTER_START:
  421. context.events_iterator._events.append(('start', node))
  422. if (context._target is None and
  423. context._event_filter & PARSE_EVENT_FILTER_END):
  424. context._node_stack.append(node)
  425. return 0
  426. cdef void _handleSaxEnd(void* ctxt, const_xmlChar* c_localname,
  427. const_xmlChar* c_prefix,
  428. const_xmlChar* c_namespace) noexcept with gil:
  429. c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
  430. if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
  431. return
  432. context = <_SaxParserContext>c_ctxt._private
  433. try:
  434. if context._target is not None:
  435. if context._target._sax_event_filter & SAX_EVENT_END:
  436. node = context._target._handleSaxEnd(
  437. _namespacedNameFromNsName(c_namespace, c_localname))
  438. else:
  439. node = None
  440. else:
  441. context._origSaxEnd(c_ctxt, c_localname, c_prefix, c_namespace)
  442. node = None
  443. _pushSaxEndEvent(context, c_namespace, c_localname, node)
  444. _pushSaxNsEndEvents(context)
  445. except:
  446. context._handleSaxException(c_ctxt)
  447. finally:
  448. return # swallow any further exceptions
  449. cdef void _handleSaxEndNoNs(void* ctxt, const_xmlChar* c_name) noexcept with gil:
  450. c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
  451. if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
  452. return
  453. context = <_SaxParserContext>c_ctxt._private
  454. try:
  455. if context._target is not None:
  456. node = context._target._handleSaxEnd(funicode(c_name))
  457. else:
  458. context._origSaxEndNoNs(c_ctxt, c_name)
  459. node = None
  460. _pushSaxEndEvent(context, NULL, c_name, node)
  461. except:
  462. context._handleSaxException(c_ctxt)
  463. finally:
  464. return # swallow any further exceptions
  465. cdef int _pushSaxNsEndEvents(_SaxParserContext context) except -1:
  466. cdef bint build_events = context._event_filter & PARSE_EVENT_FILTER_END_NS
  467. cdef bint call_target = (
  468. context._target is not None
  469. and context._target._sax_event_filter & SAX_EVENT_END_NS)
  470. if not build_events and not call_target:
  471. return 0
  472. cdef list declared_namespaces = context._ns_stack.pop()
  473. if declared_namespaces is None:
  474. return 0
  475. cdef tuple prefix_uri
  476. for prefix_uri in reversed(declared_namespaces):
  477. if call_target:
  478. context._target._handleSaxEndNs(prefix_uri[0])
  479. if build_events:
  480. context.events_iterator._events.append(('end-ns', None))
  481. return 0
  482. cdef int _pushSaxEndEvent(_SaxParserContext context,
  483. const_xmlChar* c_href,
  484. const_xmlChar* c_name, node) except -1:
  485. if context._event_filter & PARSE_EVENT_FILTER_END:
  486. if (context._matcher is None or
  487. context._matcher.matchesNsTag(c_href, c_name)):
  488. if context._target is None:
  489. node = context._node_stack.pop()
  490. context.events_iterator._events.append(('end', node))
  491. return 0
  492. cdef void _handleSaxData(void* ctxt, const_xmlChar* c_data, int data_len) noexcept with gil:
  493. # can only be called if parsing with a target
  494. c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
  495. if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
  496. return
  497. context = <_SaxParserContext>c_ctxt._private
  498. try:
  499. context._target._handleSaxData(
  500. c_data[:data_len].decode('utf8'))
  501. except:
  502. context._handleSaxException(c_ctxt)
  503. finally:
  504. return # swallow any further exceptions
  505. cdef void _handleSaxTargetDoctype(void* ctxt, const_xmlChar* c_name,
  506. const_xmlChar* c_public,
  507. const_xmlChar* c_system) noexcept with gil:
  508. # can only be called if parsing with a target
  509. c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
  510. if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
  511. return
  512. context = <_SaxParserContext>c_ctxt._private
  513. try:
  514. context._target._handleSaxDoctype(
  515. funicodeOrNone(c_name),
  516. funicodeOrNone(c_public),
  517. funicodeOrNone(c_system))
  518. except:
  519. context._handleSaxException(c_ctxt)
  520. finally:
  521. return # swallow any further exceptions
  522. cdef void _handleSaxStartDocument(void* ctxt) noexcept with gil:
  523. c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
  524. if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
  525. return
  526. context = <_SaxParserContext>c_ctxt._private
  527. context._origSaxStartDocument(ctxt)
  528. c_doc = c_ctxt.myDoc
  529. try:
  530. context.startDocument(c_doc)
  531. except:
  532. context._handleSaxException(c_ctxt)
  533. finally:
  534. return # swallow any further exceptions
  535. cdef void _handleSaxTargetPI(void* ctxt, const_xmlChar* c_target,
  536. const_xmlChar* c_data) noexcept with gil:
  537. # can only be called if parsing with a target
  538. c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
  539. if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
  540. return
  541. context = <_SaxParserContext>c_ctxt._private
  542. try:
  543. pi = context._target._handleSaxPi(
  544. funicodeOrNone(c_target),
  545. funicodeOrEmpty(c_data))
  546. if context._event_filter & PARSE_EVENT_FILTER_PI:
  547. context.events_iterator._events.append(('pi', pi))
  548. except:
  549. context._handleSaxException(c_ctxt)
  550. finally:
  551. return # swallow any further exceptions
  552. cdef void _handleSaxPIEvent(void* ctxt, const_xmlChar* target,
  553. const_xmlChar* data) noexcept with gil:
  554. # can only be called when collecting pi events
  555. c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
  556. if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
  557. return
  558. context = <_SaxParserContext>c_ctxt._private
  559. context._origSaxPI(ctxt, target, data)
  560. c_node = _findLastEventNode(c_ctxt)
  561. if c_node is NULL:
  562. return
  563. try:
  564. context.pushEvent('pi', c_node)
  565. except:
  566. context._handleSaxException(c_ctxt)
  567. finally:
  568. return # swallow any further exceptions
  569. cdef void _handleSaxTargetComment(void* ctxt, const_xmlChar* c_data) noexcept with gil:
  570. # can only be called if parsing with a target
  571. c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
  572. if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
  573. return
  574. context = <_SaxParserContext>c_ctxt._private
  575. try:
  576. comment = context._target._handleSaxComment(funicodeOrEmpty(c_data))
  577. if context._event_filter & PARSE_EVENT_FILTER_COMMENT:
  578. context.events_iterator._events.append(('comment', comment))
  579. except:
  580. context._handleSaxException(c_ctxt)
  581. finally:
  582. return # swallow any further exceptions
  583. cdef void _handleSaxComment(void* ctxt, const_xmlChar* text) noexcept with gil:
  584. # can only be called when collecting comment events
  585. c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
  586. if c_ctxt._private is NULL or xmlparser.xmlCtxtIsStopped(c_ctxt):
  587. return
  588. context = <_SaxParserContext>c_ctxt._private
  589. context._origSaxComment(ctxt, text)
  590. c_node = _findLastEventNode(c_ctxt)
  591. if c_node is NULL:
  592. return
  593. try:
  594. context.pushEvent('comment', c_node)
  595. except:
  596. context._handleSaxException(c_ctxt)
  597. finally:
  598. return # swallow any further exceptions
  599. cdef inline xmlNode* _findLastEventNode(xmlparser.xmlParserCtxt* c_ctxt):
  600. # this mimics what libxml2 creates for comments/PIs
  601. if c_ctxt.inSubset == 1:
  602. return c_ctxt.myDoc.intSubset.last
  603. elif c_ctxt.inSubset == 2:
  604. return c_ctxt.myDoc.extSubset.last
  605. elif c_ctxt.node is NULL:
  606. return c_ctxt.myDoc.last
  607. elif c_ctxt.node.type == tree.XML_ELEMENT_NODE:
  608. return c_ctxt.node.last
  609. else:
  610. return c_ctxt.node.next
  611. ############################################################
  612. ## ET compatible XML tree builder
  613. ############################################################
  614. cdef class TreeBuilder(_SaxParserTarget):
  615. """TreeBuilder(self, element_factory=None, parser=None,
  616. comment_factory=None, pi_factory=None,
  617. insert_comments=True, insert_pis=True)
  618. Parser target that builds a tree from parse event callbacks.
  619. The factory arguments can be used to influence the creation of
  620. elements, comments and processing instructions.
  621. By default, comments and processing instructions are inserted into
  622. the tree, but they can be ignored by passing the respective flags.
  623. The final tree is returned by the ``close()`` method.
  624. """
  625. cdef _BaseParser _parser
  626. cdef object _factory
  627. cdef object _comment_factory
  628. cdef object _pi_factory
  629. cdef list _data
  630. cdef list _element_stack
  631. cdef object _element_stack_pop
  632. cdef _Element _last # may be None
  633. cdef bint _in_tail
  634. cdef bint _insert_comments
  635. cdef bint _insert_pis
  636. def __init__(self, *, element_factory=None, parser=None,
  637. comment_factory=None, pi_factory=None,
  638. bint insert_comments=True, bint insert_pis=True):
  639. self._sax_event_filter = \
  640. SAX_EVENT_START | SAX_EVENT_END | SAX_EVENT_DATA | \
  641. SAX_EVENT_PI | SAX_EVENT_COMMENT
  642. self._data = [] # data collector
  643. self._element_stack = [] # element stack
  644. self._element_stack_pop = self._element_stack.pop
  645. self._last = None # last element
  646. self._in_tail = 0 # true if we're after an end tag
  647. self._factory = element_factory
  648. self._comment_factory = comment_factory if comment_factory is not None else Comment
  649. self._pi_factory = pi_factory if pi_factory is not None else ProcessingInstruction
  650. self._insert_comments = insert_comments
  651. self._insert_pis = insert_pis
  652. self._parser = parser
  653. @cython.final
  654. cdef int _flush(self) except -1:
  655. if self._data:
  656. if self._last is not None:
  657. text = "".join(self._data)
  658. if self._in_tail:
  659. assert self._last.tail is None, "internal error (tail)"
  660. self._last.tail = text
  661. else:
  662. assert self._last.text is None, "internal error (text)"
  663. self._last.text = text
  664. del self._data[:]
  665. return 0
  666. # internal SAX event handlers
  667. @cython.final
  668. cdef _handleSaxStart(self, tag, attrib, nsmap):
  669. self._flush()
  670. if self._factory is not None:
  671. self._last = self._factory(tag, attrib)
  672. if self._element_stack:
  673. _appendChild(self._element_stack[-1], self._last)
  674. elif self._element_stack:
  675. self._last = _makeSubElement(
  676. self._element_stack[-1], tag, None, None, attrib, nsmap, None)
  677. else:
  678. self._last = _makeElement(
  679. tag, NULL, None, self._parser, None, None, attrib, nsmap, None)
  680. self._element_stack.append(self._last)
  681. self._in_tail = 0
  682. return self._last
  683. @cython.final
  684. cdef _handleSaxEnd(self, tag):
  685. self._flush()
  686. self._last = self._element_stack_pop()
  687. self._in_tail = 1
  688. return self._last
  689. @cython.final
  690. cdef int _handleSaxData(self, data) except -1:
  691. self._data.append(data)
  692. @cython.final
  693. cdef _handleSaxPi(self, target, data):
  694. elem = self._pi_factory(target, data)
  695. if self._insert_pis:
  696. self._flush()
  697. self._last = elem
  698. if self._element_stack:
  699. _appendChild(self._element_stack[-1], self._last)
  700. self._in_tail = 1
  701. return self._last
  702. @cython.final
  703. cdef _handleSaxComment(self, comment):
  704. elem = self._comment_factory(comment)
  705. if self._insert_comments:
  706. self._flush()
  707. self._last = elem
  708. if self._element_stack:
  709. _appendChild(self._element_stack[-1], self._last)
  710. self._in_tail = 1
  711. return elem
  712. # Python level event handlers
  713. def close(self):
  714. """close(self)
  715. Flushes the builder buffers, and returns the toplevel document
  716. element. Raises XMLSyntaxError on inconsistencies.
  717. """
  718. if self._element_stack:
  719. raise XMLSyntaxAssertionError("missing end tags")
  720. # TODO: this does not necessarily seem like an error case. Why not just return None?
  721. if self._last is None:
  722. raise XMLSyntaxAssertionError("missing toplevel element")
  723. return self._last
  724. def data(self, data):
  725. """data(self, data)
  726. Adds text to the current element. The value should be either an
  727. 8-bit string containing ASCII text, or a Unicode string.
  728. """
  729. self._handleSaxData(data)
  730. def start(self, tag, attrs, nsmap=None):
  731. """start(self, tag, attrs, nsmap=None)
  732. Opens a new element.
  733. """
  734. if nsmap is None:
  735. nsmap = IMMUTABLE_EMPTY_MAPPING
  736. return self._handleSaxStart(tag, attrs, nsmap)
  737. def end(self, tag):
  738. """end(self, tag)
  739. Closes the current element.
  740. """
  741. element = self._handleSaxEnd(tag)
  742. assert self._last.tag == tag,\
  743. f"end tag mismatch (expected {self._last.tag}, got {tag})"
  744. return element
  745. def pi(self, target, data=None):
  746. """pi(self, target, data=None)
  747. Creates a processing instruction using the factory, appends it
  748. (unless disabled) and returns it.
  749. """
  750. return self._handleSaxPi(target, data)
  751. def comment(self, comment):
  752. """comment(self, comment)
  753. Creates a comment using the factory, appends it (unless disabled)
  754. and returns it.
  755. """
  756. return self._handleSaxComment(comment)