You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

439 lines
16 KiB

# iterparse -- event-driven parsing

# Compile-time constant: the size argument that iterparse._read_more_events()
# passes to each read() call on the input source.
DEF __ITERPARSE_CHUNK_SIZE = 32768
  3. cdef class iterparse:
  4. """iterparse(self, source, events=("end",), tag=None, \
  5. attribute_defaults=False, dtd_validation=False, \
  6. load_dtd=False, no_network=True, remove_blank_text=False, \
  7. remove_comments=False, remove_pis=False, encoding=None, \
  8. html=False, recover=None, huge_tree=False, schema=None)
  9. Incremental parser.
  10. Parses XML into a tree and generates tuples (event, element) in a
  11. SAX-like fashion. ``event`` is any of 'start', 'end', 'start-ns',
  12. 'end-ns'.
  13. For 'start' and 'end', ``element`` is the Element that the parser just
  14. found opening or closing. For 'start-ns', it is a tuple (prefix, URI) of
  15. a new namespace declaration. For 'end-ns', it is simply None. Note that
  16. all start and end events are guaranteed to be properly nested.
  17. The keyword argument ``events`` specifies a sequence of event type names
  18. that should be generated. By default, only 'end' events will be
  19. generated.
  20. The additional ``tag`` argument restricts the 'start' and 'end' events to
  21. those elements that match the given tag. The ``tag`` argument can also be
  22. a sequence of tags to allow matching more than one tag. By default,
  23. events are generated for all elements. Note that the 'start-ns' and
  24. 'end-ns' events are not impacted by this restriction.
  25. The other keyword arguments in the constructor are mainly based on the
  26. libxml2 parser configuration. A DTD will also be loaded if validation or
  27. attribute default values are requested.
  28. Available boolean keyword arguments:
  29. - attribute_defaults: read default attributes from DTD
  30. - dtd_validation: validate (if DTD is available)
  31. - load_dtd: use DTD for parsing
  32. - no_network: prevent network access for related files
  33. - remove_blank_text: discard blank text nodes
  34. - remove_comments: discard comments
  35. - remove_pis: discard processing instructions
  36. - strip_cdata: replace CDATA sections by normal text content (default:
  37. True for XML, ignored otherwise)
  38. - compact: safe memory for short text content (default: True)
  39. - resolve_entities: replace entities by their text value (default: True)
  40. - huge_tree: disable security restrictions and support very deep trees
  41. and very long text content (only affects libxml2 2.7+)
  42. - html: parse input as HTML (default: XML)
  43. - recover: try hard to parse through broken input (default: True for HTML,
  44. False otherwise)
  45. Other keyword arguments:
  46. - encoding: override the document encoding
  47. - schema: an XMLSchema to validate against
  48. """
  49. cdef _FeedParser _parser
  50. cdef object _tag
  51. cdef object _events
  52. cdef readonly object root
  53. cdef object _source
  54. cdef object _filename
  55. cdef object _error
  56. cdef bint _close_source_after_read
  57. def __init__(self, source, events=("end",), *, tag=None,
  58. attribute_defaults=False, dtd_validation=False,
  59. load_dtd=False, no_network=True, remove_blank_text=False,
  60. compact=True, resolve_entities=True, remove_comments=False,
  61. remove_pis=False, strip_cdata=True, encoding=None,
  62. html=False, recover=None, huge_tree=False, collect_ids=True,
  63. XMLSchema schema=None):
  64. if not hasattr(source, 'read'):
  65. source = _getFSPathOrObject(source)
  66. self._filename = source
  67. self._source = open(source, 'rb')
  68. self._close_source_after_read = True
  69. else:
  70. self._filename = _getFilenameForFile(source)
  71. self._source = source
  72. self._close_source_after_read = False
  73. if recover is None:
  74. recover = html
  75. if html:
  76. # make sure we're not looking for namespaces
  77. events = [event for event in events
  78. if event not in ('start-ns', 'end-ns')]
  79. parser = HTMLPullParser(
  80. events,
  81. tag=tag,
  82. recover=recover,
  83. base_url=self._filename,
  84. encoding=encoding,
  85. remove_blank_text=remove_blank_text,
  86. remove_comments=remove_comments,
  87. remove_pis=remove_pis,
  88. no_network=no_network,
  89. target=None, # TODO
  90. schema=schema,
  91. compact=compact)
  92. else:
  93. parser = XMLPullParser(
  94. events,
  95. tag=tag,
  96. recover=recover,
  97. base_url=self._filename,
  98. encoding=encoding,
  99. attribute_defaults=attribute_defaults,
  100. dtd_validation=dtd_validation,
  101. load_dtd=load_dtd,
  102. no_network=no_network,
  103. schema=schema,
  104. huge_tree=huge_tree,
  105. remove_blank_text=remove_blank_text,
  106. resolve_entities=resolve_entities,
  107. remove_comments=remove_comments,
  108. remove_pis=remove_pis,
  109. strip_cdata=strip_cdata,
  110. collect_ids=True,
  111. target=None, # TODO
  112. compact=compact)
  113. self._events = parser.read_events()
  114. self._parser = parser
  115. @property
  116. def error_log(self):
  117. """The error log of the last (or current) parser run.
  118. """
  119. return self._parser.feed_error_log
  120. @property
  121. def resolvers(self):
  122. """The custom resolver registry of the last (or current) parser run.
  123. """
  124. return self._parser.resolvers
  125. @property
  126. def version(self):
  127. """The version of the underlying XML parser."""
  128. return self._parser.version
  129. def set_element_class_lookup(self, ElementClassLookup lookup = None):
  130. """set_element_class_lookup(self, lookup = None)
  131. Set a lookup scheme for element classes generated from this parser.
  132. Reset it by passing None or nothing.
  133. """
  134. self._parser.set_element_class_lookup(lookup)
  135. def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
  136. """makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
  137. Creates a new element associated with this parser.
  138. """
  139. self._parser.makeelement(
  140. _tag, attrib=None, nsmap=None, **_extra)
  141. @cython.final
  142. cdef _close_source(self):
  143. if self._source is None:
  144. return
  145. if not self._close_source_after_read:
  146. self._source = None
  147. return
  148. try:
  149. close = self._source.close
  150. except AttributeError:
  151. close = None
  152. finally:
  153. self._source = None
  154. if close is not None:
  155. close()
  156. def __iter__(self):
  157. return self
  158. def __next__(self):
  159. try:
  160. return next(self._events)
  161. except StopIteration:
  162. pass
  163. context = <_SaxParserContext>self._parser._getPushParserContext()
  164. if self._source is not None:
  165. done = False
  166. while not done:
  167. try:
  168. done = self._read_more_events(context)
  169. return next(self._events)
  170. except StopIteration:
  171. pass # no events yet
  172. except Exception as e:
  173. self._error = e
  174. self._close_source()
  175. try:
  176. return next(self._events)
  177. except StopIteration:
  178. break
  179. # nothing left to read or return
  180. if self._error is not None:
  181. error = self._error
  182. self._error = None
  183. raise error
  184. if (context._validator is not None
  185. and not context._validator.isvalid()):
  186. _raiseParseError(context._c_ctxt, self._filename,
  187. context._error_log)
  188. # no errors => all done
  189. raise StopIteration
  190. @cython.final
  191. cdef bint _read_more_events(self, _SaxParserContext context) except -123:
  192. data = self._source.read(__ITERPARSE_CHUNK_SIZE)
  193. if not isinstance(data, bytes):
  194. self._close_source()
  195. raise TypeError("reading file objects must return bytes objects")
  196. if not data:
  197. try:
  198. self.root = self._parser.close()
  199. finally:
  200. self._close_source()
  201. return True
  202. self._parser.feed(data)
  203. return False
# States used by iterwalk to decide whether a skip_subtree() request applies.
cdef enum _IterwalkSkipStates:
    IWSKIP_NEXT_IS_START  # set by _start_node() after queueing a 'start'/'start-ns' event
    IWSKIP_SKIP_NEXT      # set by skip_subtree(): __next__ will not descend into the children
    IWSKIP_CAN_SKIP       # set by _next_event() when it returns a 'start'/'start-ns' event
    IWSKIP_CANNOT_SKIP    # default state: skip_subtree() has no effect
cdef class iterwalk:
    """iterwalk(self, element_or_tree, events=("end",), tag=None)

    A tree walker that generates events from an existing tree as if it
    was parsing XML data with ``iterparse()``.

    Just as for ``iterparse()``, the ``tag`` argument can be a single tag or a
    sequence of tags.

    After receiving a 'start' or 'start-ns' event, the children and
    descendants of the current element can be excluded from iteration
    by calling the ``skip_subtree()`` method.
    """
    # Tag matcher for 'start'/'end' events; None if tag is None or '*'.
    cdef _MultiTagMatcher _matcher
    # Stack of (element, ns_count) tuples, from the root down to the current node.
    cdef list _node_stack
    # Queue of (event, value) tuples that are waiting to be returned.
    cdef list _events
    # Bound ``self._events.pop``, called with index 0 to take the oldest event.
    cdef object _pop_event
    # Root element whose following comment/PI siblings are still to be
    # reported; only set when walking an _ElementTree with 'comment'/'pi' events.
    cdef object _include_siblings
    # Index of the current node in _node_stack; negative once the walk is over
    # (or if no events were requested at all).
    cdef int _index
    # Bit mask of requested event types, built by _buildParseEventFilter().
    cdef int _event_filter
    # See _IterwalkSkipStates.
    cdef _IterwalkSkipStates _skip_state

    def __init__(self, element_or_tree, events=("end",), tag=None):
        cdef _Element root
        cdef int ns_count
        root = _rootNodeOrRaise(element_or_tree)
        self._event_filter = _buildParseEventFilter(events)
        if tag is None or tag == '*':
            self._matcher = None
        else:
            self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)
        self._node_stack = []
        self._events = []
        self._pop_event = self._events.pop
        self._skip_state = IWSKIP_CANNOT_SKIP # ignore all skip requests by default

        if self._event_filter:
            self._index = 0
            if self._matcher is not None and self._event_filter & PARSE_EVENT_FILTER_START:
                self._matcher.cacheTags(root._doc)

            # When processing an ElementTree, add events for the preceding comments/PIs.
            if self._event_filter & (PARSE_EVENT_FILTER_COMMENT | PARSE_EVENT_FILTER_PI):
                if isinstance(element_or_tree, _ElementTree):
                    self._include_siblings = root
                    # Reverse the list collected from itersiblings(preceding=True)
                    # before queueing the events.
                    for elem in list(root.itersiblings(preceding=True))[::-1]:
                        if self._event_filter & PARSE_EVENT_FILTER_COMMENT and elem.tag is Comment:
                            self._events.append(('comment', elem))
                        elif self._event_filter & PARSE_EVENT_FILTER_PI and elem.tag is PI:
                            self._events.append(('pi', elem))

            # Queue the start events of the root and make it the current node.
            ns_count = self._start_node(root)
            self._node_stack.append( (root, ns_count) )
        else:
            # No events requested: __next__ skips the walk and stops right away.
            self._index = -1

    def __iter__(self):
        return self

    def __next__(self):
        cdef xmlNode* c_child
        cdef _Element node
        cdef _Element next_node
        cdef int ns_count = 0
        # Hand out queued events first.
        if self._events:
            return self._next_event()
        if self._matcher is not None and self._index >= 0:
            # NOTE(review): presumably refreshes the matcher's tag cache for
            # the current document -- confirm against _MultiTagMatcher.
            node = self._node_stack[self._index][0]
            self._matcher.cacheTags(node._doc)

        # find next node
        while self._index >= 0:
            node = self._node_stack[self._index][0]

            if self._skip_state == IWSKIP_SKIP_NEXT:
                # skip_subtree() was requested: act as if there were no children
                c_child = NULL
            else:
                c_child = self._process_non_elements(
                    node._doc, _findChildForwards(node._c_node, 0))
            # A skip request only ever applies to a single step.
            self._skip_state = IWSKIP_CANNOT_SKIP

            while c_child is NULL:
                # back off through parents
                self._index -= 1
                node = self._end_node()
                if self._index < 0:
                    break
                c_child = self._process_non_elements(
                    node._doc, _nextElement(node._c_node))

            if c_child is not NULL:
                # Found the next element: queue its start events and push it.
                next_node = _elementFactory(node._doc, c_child)
                if self._event_filter & (PARSE_EVENT_FILTER_START |
                                         PARSE_EVENT_FILTER_START_NS):
                    ns_count = self._start_node(next_node)
                elif self._event_filter & PARSE_EVENT_FILTER_END_NS:
                    # Only 'end-ns' requested: the count is still needed by _end_node().
                    ns_count = _countNsDefs(next_node._c_node)
                self._node_stack.append( (next_node, ns_count) )
                self._index += 1
            if self._events:
                return self._next_event()

        if self._include_siblings is not None:
            # Report comments/PIs that follow the root element, exactly once.
            node, self._include_siblings = self._include_siblings, None
            self._process_non_elements(node._doc, _nextElement(node._c_node))
            if self._events:
                return self._next_event()

        raise StopIteration

    @cython.final
    cdef xmlNode* _process_non_elements(self, _Document doc, xmlNode* c_node):
        # Starting at c_node, step over comment and PI nodes, queueing a
        # 'comment'/'pi' event for each one whose type is enabled in the event
        # filter.  Returns the first node that is an element or of any other
        # type, or NULL if the chain ends.
        while c_node is not NULL and c_node.type != tree.XML_ELEMENT_NODE:
            if c_node.type == tree.XML_COMMENT_NODE:
                if self._event_filter & PARSE_EVENT_FILTER_COMMENT:
                    self._events.append(
                        ("comment", _elementFactory(doc, c_node)))
                c_node = _nextElement(c_node)
            elif c_node.type == tree.XML_PI_NODE:
                if self._event_filter & PARSE_EVENT_FILTER_PI:
                    self._events.append(
                        ("pi", _elementFactory(doc, c_node)))
                c_node = _nextElement(c_node)
            else:
                break
        return c_node

    @cython.final
    cdef _next_event(self):
        # Remove and return the oldest queued event.  If a start event was
        # queued by _start_node() and is the one being returned now,
        # skip_subtree() becomes effective.
        if self._skip_state == IWSKIP_NEXT_IS_START:
            if self._events[0][0] in ('start', 'start-ns'):
                self._skip_state = IWSKIP_CAN_SKIP
        return self._pop_event(0)

    def skip_subtree(self):
        """Prevent descending into the current subtree.
        Instead, the next returned event will be the 'end' event of the current element
        (if included), ignoring any children or descendants.

        This has no effect right after an 'end' or 'end-ns' event.
        """
        if self._skip_state == IWSKIP_CAN_SKIP:
            self._skip_state = IWSKIP_SKIP_NEXT

    @cython.final
    cdef int _start_node(self, _Element node) except -1:
        # Queue the 'start-ns' and 'start' events requested for node and
        # return the namespace declaration count that _end_node() later uses
        # for the 'end-ns' events (0 if no namespace events were requested).
        cdef int ns_count
        if self._event_filter & PARSE_EVENT_FILTER_START_NS:
            ns_count = _appendStartNsEvents(node._c_node, self._events)
            if self._events:
                self._skip_state = IWSKIP_NEXT_IS_START
        elif self._event_filter & PARSE_EVENT_FILTER_END_NS:
            ns_count = _countNsDefs(node._c_node)
        else:
            ns_count = 0
        if self._event_filter & PARSE_EVENT_FILTER_START:
            # The tag restriction applies to 'start' events only, not to 'start-ns'.
            if self._matcher is None or self._matcher.matches(node._c_node):
                self._events.append( ("start", node) )
                self._skip_state = IWSKIP_NEXT_IS_START
        return ns_count

    @cython.final
    cdef _Element _end_node(self):
        # Pop the top entry from the node stack, queue its 'end' event (if
        # requested and matching the tag restriction) and one 'end-ns' event
        # per counted namespace declaration, then return the popped element.
        cdef _Element node
        cdef int i, ns_count
        node, ns_count = self._node_stack.pop()
        if self._event_filter & PARSE_EVENT_FILTER_END:
            if self._matcher is None or self._matcher.matches(node._c_node):
                self._events.append( ("end", node) )
        if self._event_filter & PARSE_EVENT_FILTER_END_NS and ns_count:
            event = ("end-ns", None)
            for i in range(ns_count):
                self._events.append(event)
        return node
  362. cdef int _countNsDefs(xmlNode* c_node) noexcept:
  363. cdef xmlNs* c_ns
  364. cdef int count
  365. count = 0
  366. c_ns = c_node.nsDef
  367. while c_ns is not NULL:
  368. count += (c_ns.href is not NULL)
  369. c_ns = c_ns.next
  370. return count
  371. cdef int _appendStartNsEvents(xmlNode* c_node, list event_list) except -1:
  372. cdef xmlNs* c_ns
  373. cdef int count
  374. count = 0
  375. c_ns = c_node.nsDef
  376. while c_ns is not NULL:
  377. if c_ns.href:
  378. ns_tuple = (funicodeOrEmpty(c_ns.prefix),
  379. funicode(c_ns.href))
  380. event_list.append( ("start-ns", ns_tuple) )
  381. count += 1
  382. c_ns = c_ns.next
  383. return count