You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

344 rivejä
11 KiB

  1. # cython: language_level=3
  2. #
  3. # ElementTree
  4. # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
  5. #
  6. # limited xpath support for element trees
  7. #
  8. # history:
  9. # 2003-05-23 fl created
  10. # 2003-05-28 fl added support for // etc
  11. # 2003-08-27 fl fixed parsing of periods in element names
  12. # 2007-09-10 fl new selection engine
  13. # 2007-09-12 fl fixed parent selector
  14. # 2007-09-13 fl added iterfind; changed findall to return a list
  15. # 2007-11-30 fl added namespaces support
  16. # 2009-10-30 fl added child element value filter
  17. #
  18. # Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
  19. #
  20. # fredrik@pythonware.com
  21. # http://www.pythonware.com
  22. #
  23. # --------------------------------------------------------------------
  24. # The ElementTree toolkit is
  25. #
  26. # Copyright (c) 1999-2009 by Fredrik Lundh
  27. #
  28. # By obtaining, using, and/or copying this software and/or its
  29. # associated documentation, you agree that you have read, understood,
  30. # and will comply with the following terms and conditions:
  31. #
  32. # Permission to use, copy, modify, and distribute this software and
  33. # its associated documentation for any purpose and without fee is
  34. # hereby granted, provided that the above copyright notice appears in
  35. # all copies, and that both that copyright notice and this permission
  36. # notice appear in supporting documentation, and that the name of
  37. # Secret Labs AB or the author not be used in advertising or publicity
  38. # pertaining to distribution of the software without specific, written
  39. # prior permission.
  40. #
  41. # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
  42. # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
  43. # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
  44. # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
  45. # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  46. # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  47. # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  48. # OF THIS SOFTWARE.
  49. # --------------------------------------------------------------------
  50. ##
  51. # Implementation module for XPath support. There's usually no reason
  52. # to import this module directly; the <b>ElementTree</b> does this for
  53. # you, if needed.
  54. ##
  55. import re
  56. xpath_tokenizer_re = re.compile(
  57. "("
  58. "'[^']*'|\"[^\"]*\"|"
  59. "::|"
  60. "//?|"
  61. r"\.\.|"
  62. r"\(\)|"
  63. r"[/.*:\[\]\(\)@=])|"
  64. r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
  65. r"\s+"
  66. )
  67. def xpath_tokenizer(pattern, namespaces=None, with_prefixes=True):
  68. # ElementTree uses '', lxml used None originally.
  69. default_namespace = (namespaces.get(None) or namespaces.get('')) if namespaces else None
  70. parsing_attribute = False
  71. for token in xpath_tokenizer_re.findall(pattern):
  72. ttype, tag = token
  73. if tag and tag[0] != "{":
  74. if ":" in tag and with_prefixes:
  75. prefix, uri = tag.split(":", 1)
  76. try:
  77. if not namespaces:
  78. raise KeyError
  79. yield ttype, "{%s}%s" % (namespaces[prefix], uri)
  80. except KeyError:
  81. raise SyntaxError("prefix %r not found in prefix map" % prefix)
  82. elif tag.isdecimal():
  83. yield token # index
  84. elif default_namespace and not parsing_attribute:
  85. yield ttype, "{%s}%s" % (default_namespace, tag)
  86. else:
  87. yield token
  88. parsing_attribute = False
  89. else:
  90. yield token
  91. parsing_attribute = ttype == '@'
  92. def prepare_child(next, token):
  93. tag = token[1]
  94. def select(result):
  95. for elem in result:
  96. yield from elem.iterchildren(tag)
  97. return select
  98. def prepare_star(next, token):
  99. def select(result):
  100. for elem in result:
  101. yield from elem.iterchildren('*')
  102. return select
  103. def prepare_self(next, token):
  104. def select(result):
  105. return result
  106. return select
  107. def prepare_descendant(next, token):
  108. token = next()
  109. if token[0] == "*":
  110. tag = "*"
  111. elif not token[0]:
  112. tag = token[1]
  113. else:
  114. raise SyntaxError("invalid descendant")
  115. def select(result):
  116. for elem in result:
  117. yield from elem.iterdescendants(tag)
  118. return select
  119. def prepare_parent(next, token):
  120. def select(result):
  121. for elem in result:
  122. parent = elem.getparent()
  123. if parent is not None:
  124. yield parent
  125. return select
  126. def prepare_predicate(next, token):
  127. # FIXME: replace with real parser!!! refs:
  128. # http://effbot.org/zone/simple-iterator-parser.htm
  129. # http://javascript.crockford.com/tdop/tdop.html
  130. signature = ''
  131. predicate = []
  132. while 1:
  133. token = next()
  134. if token[0] == "]":
  135. break
  136. if token == ('', ''):
  137. # ignore whitespace
  138. continue
  139. if token[0] and token[0][:1] in "'\"":
  140. token = "'", token[0][1:-1]
  141. signature += token[0] or "-"
  142. predicate.append(token[1])
  143. # use signature to determine predicate type
  144. if signature == "@-":
  145. # [@attribute] predicate
  146. key = predicate[1]
  147. def select(result):
  148. for elem in result:
  149. if elem.get(key) is not None:
  150. yield elem
  151. return select
  152. if signature == "@-='":
  153. # [@attribute='value']
  154. key = predicate[1]
  155. value = predicate[-1]
  156. def select(result):
  157. for elem in result:
  158. if elem.get(key) == value:
  159. yield elem
  160. return select
  161. if signature == "-" and not re.match(r"-?\d+$", predicate[0]):
  162. # [tag]
  163. tag = predicate[0]
  164. def select(result):
  165. for elem in result:
  166. for _ in elem.iterchildren(tag):
  167. yield elem
  168. break
  169. return select
  170. if signature == ".='" or (signature == "-='" and not re.match(r"-?\d+$", predicate[0])):
  171. # [.='value'] or [tag='value']
  172. tag = predicate[0]
  173. value = predicate[-1]
  174. if tag:
  175. def select(result):
  176. for elem in result:
  177. for e in elem.iterchildren(tag):
  178. if "".join(e.itertext()) == value:
  179. yield elem
  180. break
  181. else:
  182. def select(result):
  183. for elem in result:
  184. if "".join(elem.itertext()) == value:
  185. yield elem
  186. return select
  187. if signature == "-" or signature == "-()" or signature == "-()-":
  188. # [index] or [last()] or [last()-index]
  189. if signature == "-":
  190. # [index]
  191. index = int(predicate[0]) - 1
  192. if index < 0:
  193. if index == -1:
  194. raise SyntaxError(
  195. "indices in path predicates are 1-based, not 0-based")
  196. else:
  197. raise SyntaxError("path index >= 1 expected")
  198. else:
  199. if predicate[0] != "last":
  200. raise SyntaxError("unsupported function")
  201. if signature == "-()-":
  202. try:
  203. index = int(predicate[2]) - 1
  204. except ValueError:
  205. raise SyntaxError("unsupported expression")
  206. else:
  207. index = -1
  208. def select(result):
  209. for elem in result:
  210. parent = elem.getparent()
  211. if parent is None:
  212. continue
  213. try:
  214. # FIXME: what if the selector is "*" ?
  215. elems = list(parent.iterchildren(elem.tag))
  216. if elems[index] is elem:
  217. yield elem
  218. except IndexError:
  219. pass
  220. return select
  221. raise SyntaxError("invalid predicate")
  222. ops = {
  223. "": prepare_child,
  224. "*": prepare_star,
  225. ".": prepare_self,
  226. "..": prepare_parent,
  227. "//": prepare_descendant,
  228. "[": prepare_predicate,
  229. }
  230. # --------------------------------------------------------------------
  231. _cache = {}
  232. def _build_path_iterator(path, namespaces, with_prefixes=True):
  233. """compile selector pattern"""
  234. if path[-1:] == "/":
  235. path += "*" # implicit all (FIXME: keep this?)
  236. cache_key = (path,)
  237. if namespaces:
  238. # lxml originally used None for the default namespace but ElementTree uses the
  239. # more convenient (all-strings-dict) empty string, so we support both here,
  240. # preferring the more convenient '', as long as they aren't ambiguous.
  241. if None in namespaces:
  242. if '' in namespaces and namespaces[None] != namespaces['']:
  243. raise ValueError("Ambiguous default namespace provided: %r versus %r" % (
  244. namespaces[None], namespaces['']))
  245. cache_key += (namespaces[None],) + tuple(sorted(
  246. item for item in namespaces.items() if item[0] is not None))
  247. else:
  248. cache_key += tuple(sorted(namespaces.items()))
  249. try:
  250. return _cache[cache_key]
  251. except KeyError:
  252. pass
  253. if len(_cache) > 100:
  254. _cache.clear()
  255. if path[:1] == "/":
  256. raise SyntaxError("cannot use absolute path on element")
  257. stream = iter(xpath_tokenizer(path, namespaces, with_prefixes=with_prefixes))
  258. try:
  259. _next = stream.next
  260. except AttributeError:
  261. # Python 3
  262. _next = stream.__next__
  263. try:
  264. token = _next()
  265. except StopIteration:
  266. raise SyntaxError("empty path expression")
  267. selector = []
  268. while 1:
  269. try:
  270. selector.append(ops[token[0]](_next, token))
  271. except StopIteration:
  272. raise SyntaxError("invalid path")
  273. try:
  274. token = _next()
  275. if token[0] == "/":
  276. token = _next()
  277. except StopIteration:
  278. break
  279. _cache[cache_key] = selector
  280. return selector
  281. ##
  282. # Iterate over the matching nodes
  283. def iterfind(elem, path, namespaces=None, with_prefixes=True):
  284. selector = _build_path_iterator(path, namespaces, with_prefixes=with_prefixes)
  285. result = iter((elem,))
  286. for select in selector:
  287. result = select(result)
  288. return result
  289. ##
  290. # Find first matching object.
  291. def find(elem, path, namespaces=None, with_prefixes=True):
  292. it = iterfind(elem, path, namespaces, with_prefixes=with_prefixes)
  293. try:
  294. return next(it)
  295. except StopIteration:
  296. return None
  297. ##
  298. # Find all matching objects.
  299. def findall(elem, path, namespaces=None, with_prefixes=True):
  300. return list(iterfind(elem, path, namespaces))
  301. ##
  302. # Find text for first matching object.
  303. def findtext(elem, path, default=None, namespaces=None, with_prefixes=True):
  304. el = find(elem, path, namespaces, with_prefixes=with_prefixes)
  305. if el is None:
  306. return default
  307. else:
  308. return el.text or ''