You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

831 lines
31 KiB

  1. # support for extension functions in XPath and XSLT
  2. cdef class XPathError(LxmlError):
  3. """Base class of all XPath errors.
  4. """
  5. cdef class XPathEvalError(XPathError):
  6. """Error during XPath evaluation.
  7. """
  8. cdef class XPathFunctionError(XPathEvalError):
  9. """Internal error looking up an XPath extension function.
  10. """
  11. cdef class XPathResultError(XPathEvalError):
  12. """Error handling an XPath result.
  13. """
# forward declarations

# Callback type used by the function (un)registration methods of
# _BaseContext: they call it as ``func(ctxt, name_utf, ns_uri_utf)`` once
# for each extension function.
ctypedef int (*_register_function)(void* ctxt, name_utf, ns_uri_utf)
# Declared up front because _BaseContext.__init__() instantiates it before
# the class body appears further down.
cdef class _ExsltRegExp
  17. ################################################################################
  18. # Base class for XSLT and XPath evaluation contexts: functions, namespaces, ...
  19. @cython.internal
  20. cdef class _BaseContext:
  21. cdef xpath.xmlXPathContext* _xpathCtxt
  22. cdef _Document _doc
  23. cdef dict _extensions
  24. cdef list _namespaces
  25. cdef list _global_namespaces
  26. cdef dict _utf_refs
  27. cdef dict _function_cache
  28. cdef dict _eval_context_dict
  29. cdef bint _build_smart_strings
  30. # for exception handling and temporary reference keeping:
  31. cdef _TempStore _temp_refs
  32. cdef set _temp_documents
  33. cdef _ExceptionContext _exc
  34. cdef _ErrorLog _error_log
  35. def __init__(self, namespaces, extensions, error_log, enable_regexp,
  36. build_smart_strings):
  37. cdef _ExsltRegExp _regexp
  38. cdef dict new_extensions
  39. cdef list ns
  40. self._utf_refs = {}
  41. self._global_namespaces = []
  42. self._function_cache = {}
  43. self._eval_context_dict = None
  44. self._error_log = error_log
  45. if extensions is not None:
  46. # convert extensions to UTF-8
  47. if isinstance(extensions, dict):
  48. extensions = (extensions,)
  49. # format: [ {(ns, name):function} ] -> {(ns_utf, name_utf):function}
  50. new_extensions = {}
  51. for extension in extensions:
  52. for (ns_uri, name), function in extension.items():
  53. if name is None:
  54. raise ValueError, "extensions must have non empty names"
  55. ns_utf = self._to_utf(ns_uri)
  56. name_utf = self._to_utf(name)
  57. new_extensions[(ns_utf, name_utf)] = function
  58. extensions = new_extensions or None
  59. if namespaces is not None:
  60. if isinstance(namespaces, dict):
  61. namespaces = namespaces.items()
  62. if namespaces:
  63. ns = []
  64. for prefix, ns_uri in namespaces:
  65. if prefix is None or not prefix:
  66. raise TypeError, \
  67. "empty namespace prefix is not supported in XPath"
  68. if ns_uri is None or not ns_uri:
  69. raise TypeError, \
  70. "setting default namespace is not supported in XPath"
  71. prefix_utf = self._to_utf(prefix)
  72. ns_uri_utf = self._to_utf(ns_uri)
  73. ns.append( (prefix_utf, ns_uri_utf) )
  74. namespaces = ns
  75. else:
  76. namespaces = None
  77. self._doc = None
  78. self._exc = _ExceptionContext()
  79. self._extensions = extensions
  80. self._namespaces = namespaces
  81. self._temp_refs = _TempStore()
  82. self._temp_documents = set()
  83. self._build_smart_strings = build_smart_strings
  84. if enable_regexp:
  85. _regexp = _ExsltRegExp()
  86. _regexp._register_in_context(self)
  87. cdef _BaseContext _copy(self):
  88. cdef _BaseContext context
  89. if self._namespaces is not None:
  90. namespaces = self._namespaces[:]
  91. else:
  92. namespaces = None
  93. context = self.__class__(namespaces, None, self._error_log, False,
  94. self._build_smart_strings)
  95. if self._extensions is not None:
  96. context._extensions = self._extensions.copy()
  97. return context
  98. cdef bytes _to_utf(self, s):
  99. "Convert to UTF-8 and keep a reference to the encoded string"
  100. cdef python.PyObject* dict_result
  101. if s is None:
  102. return None
  103. dict_result = python.PyDict_GetItem(self._utf_refs, s)
  104. if dict_result is not NULL:
  105. return <bytes>dict_result
  106. utf = _utf8(s)
  107. self._utf_refs[s] = utf
  108. if python.IS_PYPY:
  109. # use C level refs, PyPy refs are not enough!
  110. python.Py_INCREF(utf)
  111. return utf
  112. cdef void _set_xpath_context(self, xpath.xmlXPathContext* xpathCtxt) noexcept:
  113. self._xpathCtxt = xpathCtxt
  114. xpathCtxt.userData = <void*>self
  115. # Need a cast here because older libxml2 releases do not use 'const' in the functype.
  116. xpathCtxt.error = <xmlerror.xmlStructuredErrorFunc> _receiveXPathError
  117. @cython.final
  118. cdef _register_context(self, _Document doc):
  119. self._doc = doc
  120. self._exc.clear()
  121. @cython.final
  122. cdef _cleanup_context(self):
  123. #xpath.xmlXPathRegisteredNsCleanup(self._xpathCtxt)
  124. #self.unregisterGlobalNamespaces()
  125. if python.IS_PYPY:
  126. # clean up double refs in PyPy (see "_to_utf()" method)
  127. for ref in self._utf_refs.itervalues():
  128. python.Py_DECREF(ref)
  129. self._utf_refs.clear()
  130. self._eval_context_dict = None
  131. self._doc = None
  132. @cython.final
  133. cdef _release_context(self):
  134. if self._xpathCtxt is not NULL:
  135. self._xpathCtxt.userData = NULL
  136. self._xpathCtxt = NULL
  137. # namespaces (internal UTF-8 methods with leading '_')
  138. cdef addNamespace(self, prefix, ns_uri):
  139. cdef list namespaces
  140. if prefix is None:
  141. raise TypeError, "empty prefix is not supported in XPath"
  142. prefix_utf = self._to_utf(prefix)
  143. ns_uri_utf = self._to_utf(ns_uri)
  144. new_item = (prefix_utf, ns_uri_utf)
  145. if self._namespaces is None:
  146. self._namespaces = [new_item]
  147. else:
  148. namespaces = []
  149. for item in self._namespaces:
  150. if item[0] == prefix_utf:
  151. item = new_item
  152. new_item = None
  153. namespaces.append(item)
  154. if new_item is not None:
  155. namespaces.append(new_item)
  156. self._namespaces = namespaces
  157. if self._xpathCtxt is not NULL:
  158. xpath.xmlXPathRegisterNs(
  159. self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
  160. cdef registerNamespace(self, prefix, ns_uri):
  161. if prefix is None:
  162. raise TypeError, "empty prefix is not supported in XPath"
  163. prefix_utf = self._to_utf(prefix)
  164. ns_uri_utf = self._to_utf(ns_uri)
  165. self._global_namespaces.append(prefix_utf)
  166. xpath.xmlXPathRegisterNs(self._xpathCtxt,
  167. _xcstr(prefix_utf), _xcstr(ns_uri_utf))
  168. cdef registerLocalNamespaces(self):
  169. if self._namespaces is None:
  170. return
  171. for prefix_utf, ns_uri_utf in self._namespaces:
  172. xpath.xmlXPathRegisterNs(
  173. self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
  174. cdef registerGlobalNamespaces(self):
  175. cdef list ns_prefixes = _find_all_extension_prefixes()
  176. if python.PyList_GET_SIZE(ns_prefixes) > 0:
  177. for prefix_utf, ns_uri_utf in ns_prefixes:
  178. self._global_namespaces.append(prefix_utf)
  179. xpath.xmlXPathRegisterNs(
  180. self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
  181. cdef unregisterGlobalNamespaces(self):
  182. if python.PyList_GET_SIZE(self._global_namespaces) > 0:
  183. for prefix_utf in self._global_namespaces:
  184. xpath.xmlXPathRegisterNs(self._xpathCtxt,
  185. _xcstr(prefix_utf), NULL)
  186. del self._global_namespaces[:]
  187. cdef void _unregisterNamespace(self, prefix_utf) noexcept:
  188. xpath.xmlXPathRegisterNs(self._xpathCtxt,
  189. _xcstr(prefix_utf), NULL)
  190. # extension functions
  191. cdef int _addLocalExtensionFunction(self, ns_utf, name_utf, function) except -1:
  192. if self._extensions is None:
  193. self._extensions = {}
  194. self._extensions[(ns_utf, name_utf)] = function
  195. return 0
  196. cdef registerGlobalFunctions(self, void* ctxt,
  197. _register_function reg_func):
  198. cdef python.PyObject* dict_result
  199. cdef dict d
  200. for ns_utf, ns_functions in __FUNCTION_NAMESPACE_REGISTRIES.iteritems():
  201. dict_result = python.PyDict_GetItem(
  202. self._function_cache, ns_utf)
  203. if dict_result is not NULL:
  204. d = <dict>dict_result
  205. else:
  206. d = {}
  207. self._function_cache[ns_utf] = d
  208. for name_utf, function in ns_functions.iteritems():
  209. d[name_utf] = function
  210. reg_func(ctxt, name_utf, ns_utf)
  211. cdef registerLocalFunctions(self, void* ctxt,
  212. _register_function reg_func):
  213. cdef python.PyObject* dict_result
  214. cdef dict d
  215. if self._extensions is None:
  216. return # done
  217. last_ns = None
  218. d = None
  219. for (ns_utf, name_utf), function in self._extensions.iteritems():
  220. if ns_utf is not last_ns or d is None:
  221. last_ns = ns_utf
  222. dict_result = python.PyDict_GetItem(
  223. self._function_cache, ns_utf)
  224. if dict_result is not NULL:
  225. d = <dict>dict_result
  226. else:
  227. d = {}
  228. self._function_cache[ns_utf] = d
  229. d[name_utf] = function
  230. reg_func(ctxt, name_utf, ns_utf)
  231. cdef unregisterAllFunctions(self, void* ctxt,
  232. _register_function unreg_func):
  233. for ns_utf, functions in self._function_cache.iteritems():
  234. for name_utf in functions:
  235. unreg_func(ctxt, name_utf, ns_utf)
  236. cdef unregisterGlobalFunctions(self, void* ctxt,
  237. _register_function unreg_func):
  238. for ns_utf, functions in self._function_cache.items():
  239. for name_utf in functions:
  240. if self._extensions is None or \
  241. (ns_utf, name_utf) not in self._extensions:
  242. unreg_func(ctxt, name_utf, ns_utf)
  243. @cython.final
  244. cdef _find_cached_function(self, const_xmlChar* c_ns_uri, const_xmlChar* c_name):
  245. """Lookup an extension function in the cache and return it.
  246. Parameters: c_ns_uri may be NULL, c_name must not be NULL
  247. """
  248. cdef python.PyObject* c_dict
  249. cdef python.PyObject* dict_result
  250. c_dict = python.PyDict_GetItem(
  251. self._function_cache, None if c_ns_uri is NULL else c_ns_uri)
  252. if c_dict is not NULL:
  253. dict_result = python.PyDict_GetItem(
  254. <object>c_dict, <unsigned char*>c_name)
  255. if dict_result is not NULL:
  256. return <object>dict_result
  257. return None
  258. # Python access to the XPath context for extension functions
  259. @property
  260. def context_node(self):
  261. cdef xmlNode* c_node
  262. if self._xpathCtxt is NULL:
  263. raise XPathError, \
  264. "XPath context is only usable during the evaluation"
  265. c_node = self._xpathCtxt.node
  266. if c_node is NULL:
  267. raise XPathError, "no context node"
  268. if c_node.doc != self._xpathCtxt.doc:
  269. raise XPathError, \
  270. "document-external context nodes are not supported"
  271. if self._doc is None:
  272. raise XPathError, "document context is missing"
  273. return _elementFactory(self._doc, c_node)
  274. @property
  275. def eval_context(self):
  276. if self._eval_context_dict is None:
  277. self._eval_context_dict = {}
  278. return self._eval_context_dict
  279. # Python reference keeping during XPath function evaluation
  280. @cython.final
  281. cdef _release_temp_refs(self):
  282. "Free temporarily referenced objects from this context."
  283. self._temp_refs.clear()
  284. self._temp_documents.clear()
  285. @cython.final
  286. cdef _hold(self, obj):
  287. """A way to temporarily hold references to nodes in the evaluator.
  288. This is needed because otherwise nodes created in XPath extension
  289. functions would be reference counted too soon, during the XPath
  290. evaluation. This is most important in the case of exceptions.
  291. """
  292. cdef _Element element
  293. if isinstance(obj, _Element):
  294. self._temp_refs.add(obj)
  295. self._temp_documents.add((<_Element>obj)._doc)
  296. return
  297. elif _isString(obj) or not python.PySequence_Check(obj):
  298. return
  299. for o in obj:
  300. if isinstance(o, _Element):
  301. #print "Holding element:", <int>element._c_node
  302. self._temp_refs.add(o)
  303. #print "Holding document:", <int>element._doc._c_doc
  304. self._temp_documents.add((<_Element>o)._doc)
  305. @cython.final
  306. cdef _Document _findDocumentForNode(self, xmlNode* c_node):
  307. """If an XPath expression returns an element from a different
  308. document than the current context document, we call this to
  309. see if it was possibly created by an extension and is a known
  310. document instance.
  311. """
  312. cdef _Document doc
  313. for doc in self._temp_documents:
  314. if doc is not None and doc._c_doc is c_node.doc:
  315. return doc
  316. return None
  317. # libxml2 keeps these error messages in a static array in its code
  318. # and doesn't give us access to them ...
  319. cdef tuple LIBXML2_XPATH_ERROR_MESSAGES = (
  320. b"Ok",
  321. b"Number encoding",
  322. b"Unfinished literal",
  323. b"Start of literal",
  324. b"Expected $ for variable reference",
  325. b"Undefined variable",
  326. b"Invalid predicate",
  327. b"Invalid expression",
  328. b"Missing closing curly brace",
  329. b"Unregistered function",
  330. b"Invalid operand",
  331. b"Invalid type",
  332. b"Invalid number of arguments",
  333. b"Invalid context size",
  334. b"Invalid context position",
  335. b"Memory allocation error",
  336. b"Syntax error",
  337. b"Resource error",
  338. b"Sub resource error",
  339. b"Undefined namespace prefix",
  340. b"Encoding error",
  341. b"Char out of XML range",
  342. b"Invalid or incomplete context",
  343. b"Stack usage error",
  344. b"Forbidden variable\n",
  345. b"?? Unknown error ??\n",
  346. )
  347. cdef void _forwardXPathError(void* c_ctxt, const xmlerror.xmlError* c_error) noexcept with gil:
  348. cdef xmlerror.xmlError error
  349. cdef int xpath_code
  350. if c_error.message is not NULL:
  351. error.message = c_error.message
  352. else:
  353. xpath_code = c_error.code - xmlerror.XML_XPATH_EXPRESSION_OK
  354. if 0 <= xpath_code < len(LIBXML2_XPATH_ERROR_MESSAGES):
  355. error.message = _cstr(LIBXML2_XPATH_ERROR_MESSAGES[xpath_code])
  356. else:
  357. error.message = b"unknown error"
  358. error.domain = c_error.domain
  359. error.code = c_error.code
  360. error.level = c_error.level
  361. error.line = c_error.line
  362. error.int2 = c_error.int1 # column
  363. error.file = c_error.file
  364. error.node = NULL
  365. (<_BaseContext>c_ctxt)._error_log._receive(&error)
  366. cdef void _receiveXPathError(void* c_context, const xmlerror.xmlError* error) noexcept nogil:
  367. if not __DEBUG:
  368. return
  369. if c_context is NULL:
  370. _forwardError(NULL, error)
  371. else:
  372. _forwardXPathError(c_context, error)
  373. def Extension(module, function_mapping=None, *, ns=None):
  374. """Extension(module, function_mapping=None, ns=None)
  375. Build a dictionary of extension functions from the functions
  376. defined in a module or the methods of an object.
  377. As second argument, you can pass an additional mapping of
  378. attribute names to XPath function names, or a list of function
  379. names that should be taken.
  380. The ``ns`` keyword argument accepts a namespace URI for the XPath
  381. functions.
  382. """
  383. cdef dict functions = {}
  384. if isinstance(function_mapping, dict):
  385. for function_name, xpath_name in function_mapping.items():
  386. functions[(ns, xpath_name)] = getattr(module, function_name)
  387. else:
  388. if function_mapping is None:
  389. function_mapping = [ name for name in dir(module)
  390. if not name.startswith('_') ]
  391. for function_name in function_mapping:
  392. functions[(ns, function_name)] = getattr(module, function_name)
  393. return functions
  394. ################################################################################
  395. # EXSLT regexp implementation
  396. @cython.final
  397. @cython.internal
  398. cdef class _ExsltRegExp:
  399. cdef dict _compile_map
  400. def __cinit__(self):
  401. self._compile_map = {}
  402. cdef _make_string(self, value):
  403. if _isString(value):
  404. return value
  405. elif isinstance(value, list):
  406. # node set: take recursive text concatenation of first element
  407. if python.PyList_GET_SIZE(value) == 0:
  408. return ''
  409. firstnode = value[0]
  410. if _isString(firstnode):
  411. return firstnode
  412. elif isinstance(firstnode, _Element):
  413. c_text = tree.xmlNodeGetContent((<_Element>firstnode)._c_node)
  414. if c_text is NULL:
  415. raise MemoryError()
  416. try:
  417. return funicode(c_text)
  418. finally:
  419. tree.xmlFree(c_text)
  420. else:
  421. return unicode(firstnode)
  422. else:
  423. return unicode(value)
  424. cdef _compile(self, rexp, ignore_case):
  425. cdef python.PyObject* c_result
  426. rexp = self._make_string(rexp)
  427. key = (rexp, ignore_case)
  428. c_result = python.PyDict_GetItem(self._compile_map, key)
  429. if c_result is not NULL:
  430. return <object>c_result
  431. py_flags = re.UNICODE
  432. if ignore_case:
  433. py_flags = py_flags | re.IGNORECASE
  434. rexp_compiled = re.compile(rexp, py_flags)
  435. self._compile_map[key] = rexp_compiled
  436. return rexp_compiled
  437. def test(self, ctxt, s, rexp, flags=''):
  438. flags = self._make_string(flags)
  439. s = self._make_string(s)
  440. rexpc = self._compile(rexp, 'i' in flags)
  441. if rexpc.search(s) is None:
  442. return False
  443. else:
  444. return True
  445. def match(self, ctxt, s, rexp, flags=''):
  446. cdef list result_list
  447. flags = self._make_string(flags)
  448. s = self._make_string(s)
  449. rexpc = self._compile(rexp, 'i' in flags)
  450. if 'g' in flags:
  451. results = rexpc.findall(s)
  452. if not results:
  453. return ()
  454. else:
  455. result = rexpc.search(s)
  456. if not result:
  457. return ()
  458. results = [ result.group() ]
  459. results.extend( result.groups('') )
  460. result_list = []
  461. root = Element('matches')
  462. for s_match in results:
  463. if python.PyTuple_CheckExact(s_match):
  464. s_match = ''.join(s_match)
  465. elem = SubElement(root, 'match')
  466. elem.text = s_match
  467. result_list.append(elem)
  468. return result_list
  469. def replace(self, ctxt, s, rexp, flags, replacement):
  470. replacement = self._make_string(replacement)
  471. flags = self._make_string(flags)
  472. s = self._make_string(s)
  473. rexpc = self._compile(rexp, 'i' in flags)
  474. count: object = 0 if 'g' in flags else 1
  475. return rexpc.sub(replacement, s, count)
  476. cdef _register_in_context(self, _BaseContext context):
  477. ns = b"http://exslt.org/regular-expressions"
  478. context._addLocalExtensionFunction(ns, b"test", self.test)
  479. context._addLocalExtensionFunction(ns, b"match", self.match)
  480. context._addLocalExtensionFunction(ns, b"replace", self.replace)
  481. ################################################################################
  482. # helper functions
  483. cdef xpath.xmlXPathObject* _wrapXPathObject(object obj, _Document doc,
  484. _BaseContext context) except NULL:
  485. cdef xpath.xmlNodeSet* resultSet
  486. cdef _Element fake_node = None
  487. cdef xmlNode* c_node
  488. if isinstance(obj, unicode):
  489. obj = _utf8(obj)
  490. if isinstance(obj, bytes):
  491. # libxml2 copies the string value
  492. return xpath.xmlXPathNewCString(_cstr(obj))
  493. if isinstance(obj, bool):
  494. return xpath.xmlXPathNewBoolean(obj)
  495. if python.PyNumber_Check(obj):
  496. return xpath.xmlXPathNewFloat(obj)
  497. if obj is None:
  498. resultSet = xpath.xmlXPathNodeSetCreate(NULL)
  499. elif isinstance(obj, _Element):
  500. resultSet = xpath.xmlXPathNodeSetCreate((<_Element>obj)._c_node)
  501. elif python.PySequence_Check(obj):
  502. resultSet = xpath.xmlXPathNodeSetCreate(NULL)
  503. try:
  504. for value in obj:
  505. if isinstance(value, _Element):
  506. if context is not None:
  507. context._hold(value)
  508. xpath.xmlXPathNodeSetAdd(resultSet, (<_Element>value)._c_node)
  509. else:
  510. if context is None or doc is None:
  511. raise XPathResultError, \
  512. f"Non-Element values not supported at this point - got {value!r}"
  513. # support strings by appending text nodes to an Element
  514. if isinstance(value, unicode):
  515. value = _utf8(value)
  516. if isinstance(value, bytes):
  517. if fake_node is None:
  518. fake_node = _makeElement("text-root", NULL, doc, None,
  519. None, None, None, None, None)
  520. context._hold(fake_node)
  521. else:
  522. # append a comment node to keep the text nodes separate
  523. c_node = tree.xmlNewDocComment(doc._c_doc, <unsigned char*>"")
  524. if c_node is NULL:
  525. raise MemoryError()
  526. tree.xmlAddChild(fake_node._c_node, c_node)
  527. context._hold(value)
  528. c_node = tree.xmlNewDocText(doc._c_doc, _xcstr(value))
  529. if c_node is NULL:
  530. raise MemoryError()
  531. tree.xmlAddChild(fake_node._c_node, c_node)
  532. xpath.xmlXPathNodeSetAdd(resultSet, c_node)
  533. else:
  534. raise XPathResultError, \
  535. f"This is not a supported node-set result: {value!r}"
  536. except:
  537. xpath.xmlXPathFreeNodeSet(resultSet)
  538. raise
  539. else:
  540. raise XPathResultError, f"Unknown return type: {python._fqtypename(obj).decode('utf8')}"
  541. return xpath.xmlXPathWrapNodeSet(resultSet)
  542. cdef object _unwrapXPathObject(xpath.xmlXPathObject* xpathObj,
  543. _Document doc, _BaseContext context):
  544. if xpathObj.type == xpath.XPATH_UNDEFINED:
  545. raise XPathResultError, "Undefined xpath result"
  546. elif xpathObj.type == xpath.XPATH_NODESET:
  547. return _createNodeSetResult(xpathObj, doc, context)
  548. elif xpathObj.type == xpath.XPATH_BOOLEAN:
  549. return xpathObj.boolval
  550. elif xpathObj.type == xpath.XPATH_NUMBER:
  551. return xpathObj.floatval
  552. elif xpathObj.type == xpath.XPATH_STRING:
  553. stringval = funicode(xpathObj.stringval)
  554. if context._build_smart_strings:
  555. stringval = _elementStringResultFactory(
  556. stringval, None, None, False)
  557. return stringval
  558. elif xpathObj.type == xpath.XPATH_POINT:
  559. raise NotImplementedError, "XPATH_POINT"
  560. elif xpathObj.type == xpath.XPATH_RANGE:
  561. raise NotImplementedError, "XPATH_RANGE"
  562. elif xpathObj.type == xpath.XPATH_LOCATIONSET:
  563. raise NotImplementedError, "XPATH_LOCATIONSET"
  564. elif xpathObj.type == xpath.XPATH_USERS:
  565. raise NotImplementedError, "XPATH_USERS"
  566. elif xpathObj.type == xpath.XPATH_XSLT_TREE:
  567. return _createNodeSetResult(xpathObj, doc, context)
  568. else:
  569. raise XPathResultError, f"Unknown xpath result {xpathObj.type}"
  570. cdef object _createNodeSetResult(xpath.xmlXPathObject* xpathObj, _Document doc,
  571. _BaseContext context):
  572. cdef xmlNode* c_node
  573. cdef int i
  574. cdef list result
  575. result = []
  576. if xpathObj.nodesetval is NULL:
  577. return result
  578. for i in range(xpathObj.nodesetval.nodeNr):
  579. c_node = xpathObj.nodesetval.nodeTab[i]
  580. _unpackNodeSetEntry(result, c_node, doc, context,
  581. xpathObj.type == xpath.XPATH_XSLT_TREE)
  582. return result
  583. cdef _unpackNodeSetEntry(list results, xmlNode* c_node, _Document doc,
  584. _BaseContext context, bint is_fragment):
  585. cdef xmlNode* c_child
  586. if _isElement(c_node):
  587. if c_node.doc != doc._c_doc and c_node.doc._private is NULL:
  588. # XXX: works, but maybe not always the right thing to do?
  589. # XPath: only runs when extensions create or copy trees
  590. # -> we store Python refs to these, so that is OK
  591. # XSLT: can it leak when merging trees from multiple sources?
  592. c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1)
  593. # FIXME: call _instantiateElementFromXPath() instead?
  594. results.append(
  595. _fakeDocElementFactory(doc, c_node))
  596. elif c_node.type == tree.XML_TEXT_NODE or \
  597. c_node.type == tree.XML_CDATA_SECTION_NODE or \
  598. c_node.type == tree.XML_ATTRIBUTE_NODE:
  599. results.append(
  600. _buildElementStringResult(doc, c_node, context))
  601. elif c_node.type == tree.XML_NAMESPACE_DECL:
  602. results.append( (funicodeOrNone((<xmlNs*>c_node).prefix),
  603. funicodeOrNone((<xmlNs*>c_node).href)) )
  604. elif c_node.type == tree.XML_DOCUMENT_NODE or \
  605. c_node.type == tree.XML_HTML_DOCUMENT_NODE:
  606. # ignored for everything but result tree fragments
  607. if is_fragment:
  608. c_child = c_node.children
  609. while c_child is not NULL:
  610. _unpackNodeSetEntry(results, c_child, doc, context, 0)
  611. c_child = c_child.next
  612. elif c_node.type == tree.XML_XINCLUDE_START or \
  613. c_node.type == tree.XML_XINCLUDE_END:
  614. pass
  615. else:
  616. raise NotImplementedError, \
  617. f"Not yet implemented result node type: {c_node.type}"
  618. cdef void _freeXPathObject(xpath.xmlXPathObject* xpathObj) noexcept:
  619. """Free the XPath object, but *never* free the *content* of node sets.
  620. Python dealloc will do that for us.
  621. """
  622. if xpathObj.nodesetval is not NULL:
  623. xpath.xmlXPathFreeNodeSet(xpathObj.nodesetval)
  624. xpathObj.nodesetval = NULL
  625. xpath.xmlXPathFreeObject(xpathObj)
  626. cdef _Element _instantiateElementFromXPath(xmlNode* c_node, _Document doc,
  627. _BaseContext context):
  628. # NOTE: this may copy the element - only call this when it can't leak
  629. if c_node.doc != doc._c_doc and c_node.doc._private is NULL:
  630. # not from the context document and not from a fake document
  631. # either => may still be from a known document, e.g. one
  632. # created by an extension function
  633. node_doc = context._findDocumentForNode(c_node)
  634. if node_doc is None:
  635. # not from a known document at all! => can only make a
  636. # safety copy here
  637. c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1)
  638. else:
  639. doc = node_doc
  640. return _fakeDocElementFactory(doc, c_node)
  641. ################################################################################
  642. # special str/unicode subclasses
  643. @cython.final
  644. cdef class _ElementUnicodeResult(unicode):
  645. cdef _Element _parent
  646. cdef readonly object attrname
  647. cdef readonly bint is_tail
  648. def getparent(self):
  649. return self._parent
  650. @property
  651. def is_text(self):
  652. return self._parent is not None and not (self.is_tail or self.attrname is not None)
  653. @property
  654. def is_attribute(self):
  655. return self.attrname is not None
  656. cdef object _elementStringResultFactory(string_value, _Element parent,
  657. attrname, bint is_tail):
  658. result = _ElementUnicodeResult(string_value)
  659. result._parent = parent
  660. result.is_tail = is_tail
  661. result.attrname = attrname
  662. return result
  663. cdef object _buildElementStringResult(_Document doc, xmlNode* c_node,
  664. _BaseContext context):
  665. cdef _Element parent = None
  666. cdef object attrname = None
  667. cdef xmlNode* c_element
  668. cdef bint is_tail
  669. if c_node.type == tree.XML_ATTRIBUTE_NODE:
  670. attrname = _namespacedName(c_node)
  671. is_tail = 0
  672. s = tree.xmlNodeGetContent(c_node)
  673. try:
  674. value = funicode(s)
  675. finally:
  676. tree.xmlFree(s)
  677. c_element = NULL
  678. else:
  679. #assert c_node.type == tree.XML_TEXT_NODE or c_node.type == tree.XML_CDATA_SECTION_NODE, "invalid node type"
  680. # may be tail text or normal text
  681. value = funicode(c_node.content)
  682. c_element = _previousElement(c_node)
  683. is_tail = c_element is not NULL
  684. if not context._build_smart_strings:
  685. return value
  686. if c_element is NULL:
  687. # non-tail text or attribute text
  688. c_element = c_node.parent
  689. while c_element is not NULL and not _isElement(c_element):
  690. c_element = c_element.parent
  691. if c_element is not NULL:
  692. parent = _instantiateElementFromXPath(c_element, doc, context)
  693. return _elementStringResultFactory(
  694. value, parent, attrname, is_tail)
  695. ################################################################################
  696. # callbacks for XPath/XSLT extension functions
  697. cdef void _extension_function_call(_BaseContext context, function,
  698. xpath.xmlXPathParserContext* ctxt, int nargs) noexcept:
  699. cdef _Document doc
  700. cdef xpath.xmlXPathObject* obj
  701. cdef list args
  702. cdef int i
  703. doc = context._doc
  704. try:
  705. args = []
  706. for i in range(nargs):
  707. obj = xpath.valuePop(ctxt)
  708. o = _unwrapXPathObject(obj, doc, context)
  709. _freeXPathObject(obj)
  710. args.append(o)
  711. args.reverse()
  712. res = function(context, *args)
  713. # wrap result for XPath consumption
  714. obj = _wrapXPathObject(res, doc, context)
  715. # prevent Python from deallocating elements handed to libxml2
  716. context._hold(res)
  717. xpath.valuePush(ctxt, obj)
  718. except:
  719. xpath.xmlXPathErr(ctxt, xpath.XPATH_EXPR_ERROR)
  720. context._exc._store_raised()
  721. finally:
  722. return # swallow any further exceptions
  723. # lookup the function by name and call it
  724. cdef void _xpath_function_call(xpath.xmlXPathParserContext* ctxt,
  725. int nargs) noexcept with gil:
  726. cdef _BaseContext context
  727. cdef xpath.xmlXPathContext* rctxt = ctxt.context
  728. context = <_BaseContext> rctxt.userData
  729. try:
  730. function = context._find_cached_function(rctxt.functionURI, rctxt.function)
  731. if function is not None:
  732. _extension_function_call(context, function, ctxt, nargs)
  733. else:
  734. xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR)
  735. context._exc._store_exception(XPathFunctionError(
  736. f"XPath function '{_namespacedNameFromNsName(rctxt.functionURI, rctxt.function)}' not found"))
  737. except:
  738. # may not be the right error, but we need to tell libxml2 *something*
  739. xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR)
  740. context._exc._store_raised()
  741. finally:
  742. return # swallow any further exceptions