您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
 
 
 
 

480 行
15 KiB

  1. # support for DTD validation
  2. from lxml.includes cimport dtdvalid
  3. cdef class DTDError(LxmlError):
  4. """Base class for DTD errors.
  5. """
  6. cdef class DTDParseError(DTDError):
  7. """Error while parsing a DTD.
  8. """
  9. cdef class DTDValidateError(DTDError):
  10. """Error while validating an XML document with a DTD.
  11. """
  12. cdef inline int _assertValidDTDNode(node, void *c_node) except -1:
  13. assert c_node is not NULL, "invalid DTD proxy at %s" % id(node)
  14. @cython.final
  15. @cython.internal
  16. @cython.freelist(8)
  17. cdef class _DTDElementContentDecl:
  18. cdef DTD _dtd
  19. cdef tree.xmlElementContent* _c_node
  20. def __repr__(self):
  21. return "<%s.%s object name=%r type=%r occur=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.type, self.occur, id(self))
  22. @property
  23. def name(self):
  24. _assertValidDTDNode(self, self._c_node)
  25. return funicodeOrNone(self._c_node.name)
  26. @property
  27. def type(self):
  28. _assertValidDTDNode(self, self._c_node)
  29. cdef int type = self._c_node.type
  30. if type == tree.XML_ELEMENT_CONTENT_PCDATA:
  31. return "pcdata"
  32. elif type == tree.XML_ELEMENT_CONTENT_ELEMENT:
  33. return "element"
  34. elif type == tree.XML_ELEMENT_CONTENT_SEQ:
  35. return "seq"
  36. elif type == tree.XML_ELEMENT_CONTENT_OR:
  37. return "or"
  38. else:
  39. return None
  40. @property
  41. def occur(self):
  42. _assertValidDTDNode(self, self._c_node)
  43. cdef int occur = self._c_node.ocur
  44. if occur == tree.XML_ELEMENT_CONTENT_ONCE:
  45. return "once"
  46. elif occur == tree.XML_ELEMENT_CONTENT_OPT:
  47. return "opt"
  48. elif occur == tree.XML_ELEMENT_CONTENT_MULT:
  49. return "mult"
  50. elif occur == tree.XML_ELEMENT_CONTENT_PLUS:
  51. return "plus"
  52. else:
  53. return None
  54. @property
  55. def left(self):
  56. _assertValidDTDNode(self, self._c_node)
  57. c1 = self._c_node.c1
  58. if c1:
  59. node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
  60. node._dtd = self._dtd
  61. node._c_node = <tree.xmlElementContent*>c1
  62. return node
  63. else:
  64. return None
  65. @property
  66. def right(self):
  67. _assertValidDTDNode(self, self._c_node)
  68. c2 = self._c_node.c2
  69. if c2:
  70. node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
  71. node._dtd = self._dtd
  72. node._c_node = <tree.xmlElementContent*>c2
  73. return node
  74. else:
  75. return None
  76. @cython.final
  77. @cython.internal
  78. @cython.freelist(8)
  79. cdef class _DTDAttributeDecl:
  80. cdef DTD _dtd
  81. cdef tree.xmlAttribute* _c_node
  82. def __repr__(self):
  83. return "<%s.%s object name=%r elemname=%r prefix=%r type=%r default=%r default_value=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.elemname, self.prefix, self.type, self.default, self.default_value, id(self))
  84. @property
  85. def name(self):
  86. _assertValidDTDNode(self, self._c_node)
  87. return funicodeOrNone(self._c_node.name)
  88. @property
  89. def elemname(self):
  90. _assertValidDTDNode(self, self._c_node)
  91. return funicodeOrNone(self._c_node.elem)
  92. @property
  93. def prefix(self):
  94. _assertValidDTDNode(self, self._c_node)
  95. return funicodeOrNone(self._c_node.prefix)
  96. @property
  97. def type(self):
  98. _assertValidDTDNode(self, self._c_node)
  99. cdef int type = self._c_node.atype
  100. if type == tree.XML_ATTRIBUTE_CDATA:
  101. return "cdata"
  102. elif type == tree.XML_ATTRIBUTE_ID:
  103. return "id"
  104. elif type == tree.XML_ATTRIBUTE_IDREF:
  105. return "idref"
  106. elif type == tree.XML_ATTRIBUTE_IDREFS:
  107. return "idrefs"
  108. elif type == tree.XML_ATTRIBUTE_ENTITY:
  109. return "entity"
  110. elif type == tree.XML_ATTRIBUTE_ENTITIES:
  111. return "entities"
  112. elif type == tree.XML_ATTRIBUTE_NMTOKEN:
  113. return "nmtoken"
  114. elif type == tree.XML_ATTRIBUTE_NMTOKENS:
  115. return "nmtokens"
  116. elif type == tree.XML_ATTRIBUTE_ENUMERATION:
  117. return "enumeration"
  118. elif type == tree.XML_ATTRIBUTE_NOTATION:
  119. return "notation"
  120. else:
  121. return None
  122. @property
  123. def default(self):
  124. _assertValidDTDNode(self, self._c_node)
  125. cdef int default = self._c_node.def_
  126. if default == tree.XML_ATTRIBUTE_NONE:
  127. return "none"
  128. elif default == tree.XML_ATTRIBUTE_REQUIRED:
  129. return "required"
  130. elif default == tree.XML_ATTRIBUTE_IMPLIED:
  131. return "implied"
  132. elif default == tree.XML_ATTRIBUTE_FIXED:
  133. return "fixed"
  134. else:
  135. return None
  136. @property
  137. def default_value(self):
  138. _assertValidDTDNode(self, self._c_node)
  139. return funicodeOrNone(self._c_node.defaultValue)
  140. def itervalues(self):
  141. _assertValidDTDNode(self, self._c_node)
  142. cdef tree.xmlEnumeration *c_node = self._c_node.tree
  143. while c_node is not NULL:
  144. yield funicode(c_node.name)
  145. c_node = c_node.next
  146. def values(self):
  147. return list(self.itervalues())
  148. @cython.final
  149. @cython.internal
  150. @cython.freelist(8)
  151. cdef class _DTDElementDecl:
  152. cdef DTD _dtd
  153. cdef tree.xmlElement* _c_node
  154. def __repr__(self):
  155. return "<%s.%s object name=%r prefix=%r type=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.prefix, self.type, id(self))
  156. @property
  157. def name(self):
  158. _assertValidDTDNode(self, self._c_node)
  159. return funicodeOrNone(self._c_node.name)
  160. @property
  161. def prefix(self):
  162. _assertValidDTDNode(self, self._c_node)
  163. return funicodeOrNone(self._c_node.prefix)
  164. @property
  165. def type(self):
  166. _assertValidDTDNode(self, self._c_node)
  167. cdef int type = self._c_node.etype
  168. if type == tree.XML_ELEMENT_TYPE_UNDEFINED:
  169. return "undefined"
  170. elif type == tree.XML_ELEMENT_TYPE_EMPTY:
  171. return "empty"
  172. elif type == tree.XML_ELEMENT_TYPE_ANY:
  173. return "any"
  174. elif type == tree.XML_ELEMENT_TYPE_MIXED:
  175. return "mixed"
  176. elif type == tree.XML_ELEMENT_TYPE_ELEMENT:
  177. return "element"
  178. else:
  179. return None
  180. @property
  181. def content(self):
  182. _assertValidDTDNode(self, self._c_node)
  183. cdef tree.xmlElementContent *content = self._c_node.content
  184. if content:
  185. node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
  186. node._dtd = self._dtd
  187. node._c_node = content
  188. return node
  189. else:
  190. return None
  191. def iterattributes(self):
  192. _assertValidDTDNode(self, self._c_node)
  193. cdef tree.xmlAttribute *c_node = self._c_node.attributes
  194. while c_node:
  195. node = <_DTDAttributeDecl>_DTDAttributeDecl.__new__(_DTDAttributeDecl)
  196. node._dtd = self._dtd
  197. node._c_node = c_node
  198. yield node
  199. c_node = c_node.nexth
  200. def attributes(self):
  201. return list(self.iterattributes())
  202. @cython.final
  203. @cython.internal
  204. @cython.freelist(8)
  205. cdef class _DTDEntityDecl:
  206. cdef DTD _dtd
  207. cdef tree.xmlEntity* _c_node
  208. def __repr__(self):
  209. return "<%s.%s object name=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
  210. @property
  211. def name(self):
  212. _assertValidDTDNode(self, self._c_node)
  213. return funicodeOrNone(self._c_node.name)
  214. @property
  215. def orig(self):
  216. _assertValidDTDNode(self, self._c_node)
  217. return funicodeOrNone(self._c_node.orig)
  218. @property
  219. def content(self):
  220. _assertValidDTDNode(self, self._c_node)
  221. return funicodeOrNone(self._c_node.content)
  222. @property
  223. def system_url(self):
  224. _assertValidDTDNode(self, self._c_node)
  225. return funicodeOrNone(self._c_node.SystemID)
  226. ################################################################################
  227. # DTD
  228. cdef class DTD(_Validator):
  229. """DTD(self, file=None, external_id=None)
  230. A DTD validator.
  231. Can load from filesystem directly given a filename or file-like object.
  232. Alternatively, pass the keyword parameter ``external_id`` to load from a
  233. catalog.
  234. """
  235. cdef tree.xmlDtd* _c_dtd
  236. def __init__(self, file=None, *, external_id=None):
  237. _Validator.__init__(self)
  238. if file is not None:
  239. file = _getFSPathOrObject(file)
  240. if _isString(file):
  241. file = _encodeFilename(file)
  242. with self._error_log:
  243. orig_loader = _register_document_loader()
  244. self._c_dtd = xmlparser.xmlParseDTD(NULL, _xcstr(file))
  245. _reset_document_loader(orig_loader)
  246. elif hasattr(file, 'read'):
  247. orig_loader = _register_document_loader()
  248. self._c_dtd = _parseDtdFromFilelike(file)
  249. _reset_document_loader(orig_loader)
  250. else:
  251. raise DTDParseError, "file must be a filename, file-like or path-like object"
  252. elif external_id is not None:
  253. external_id_utf = _utf8(external_id)
  254. with self._error_log:
  255. orig_loader = _register_document_loader()
  256. self._c_dtd = xmlparser.xmlParseDTD(<const_xmlChar*>external_id_utf, NULL)
  257. _reset_document_loader(orig_loader)
  258. else:
  259. raise DTDParseError, "either filename or external ID required"
  260. if self._c_dtd is NULL:
  261. raise DTDParseError(
  262. self._error_log._buildExceptionMessage("error parsing DTD"),
  263. self._error_log)
  264. @property
  265. def name(self):
  266. if self._c_dtd is NULL:
  267. return None
  268. return funicodeOrNone(self._c_dtd.name)
  269. @property
  270. def external_id(self):
  271. if self._c_dtd is NULL:
  272. return None
  273. return funicodeOrNone(self._c_dtd.ExternalID)
  274. @property
  275. def system_url(self):
  276. if self._c_dtd is NULL:
  277. return None
  278. return funicodeOrNone(self._c_dtd.SystemID)
  279. def iterelements(self):
  280. cdef tree.xmlNode *c_node = self._c_dtd.children if self._c_dtd is not NULL else NULL
  281. while c_node is not NULL:
  282. if c_node.type == tree.XML_ELEMENT_DECL:
  283. node = _DTDElementDecl()
  284. node._dtd = self
  285. node._c_node = <tree.xmlElement*>c_node
  286. yield node
  287. c_node = c_node.next
  288. def elements(self):
  289. return list(self.iterelements())
  290. def iterentities(self):
  291. cdef tree.xmlNode *c_node = self._c_dtd.children if self._c_dtd is not NULL else NULL
  292. while c_node is not NULL:
  293. if c_node.type == tree.XML_ENTITY_DECL:
  294. node = _DTDEntityDecl()
  295. node._dtd = self
  296. node._c_node = <tree.xmlEntity*>c_node
  297. yield node
  298. c_node = c_node.next
  299. def entities(self):
  300. return list(self.iterentities())
  301. def __dealloc__(self):
  302. tree.xmlFreeDtd(self._c_dtd)
  303. def __call__(self, etree):
  304. """__call__(self, etree)
  305. Validate doc using the DTD.
  306. Returns true if the document is valid, false if not.
  307. """
  308. cdef _Document doc
  309. cdef _Element root_node
  310. cdef xmlDoc* c_doc
  311. cdef dtdvalid.xmlValidCtxt* valid_ctxt
  312. cdef int ret = -1
  313. assert self._c_dtd is not NULL, "DTD not initialised"
  314. doc = _documentOrRaise(etree)
  315. root_node = _rootNodeOrRaise(etree)
  316. valid_ctxt = dtdvalid.xmlNewValidCtxt()
  317. if valid_ctxt is NULL:
  318. raise DTDError("Failed to create validation context")
  319. # work around error reporting bug in libxml2 <= 2.9.1 (and later?)
  320. # https://bugzilla.gnome.org/show_bug.cgi?id=724903
  321. valid_ctxt.error = <dtdvalid.xmlValidityErrorFunc>_nullGenericErrorFunc
  322. valid_ctxt.userData = NULL
  323. try:
  324. with self._error_log:
  325. c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node)
  326. ret = dtdvalid.xmlValidateDtd(valid_ctxt, c_doc, self._c_dtd)
  327. _destroyFakeDoc(doc._c_doc, c_doc)
  328. finally:
  329. dtdvalid.xmlFreeValidCtxt(valid_ctxt)
  330. if ret == -1:
  331. raise DTDValidateError("Internal error in DTD validation",
  332. self._error_log)
  333. return ret == 1
  334. cdef tree.xmlDtd* _parseDtdFromFilelike(file) except NULL:
  335. cdef _ExceptionContext exc_context
  336. cdef _FileReaderContext dtd_parser
  337. cdef _ErrorLog error_log
  338. cdef tree.xmlDtd* c_dtd = NULL
  339. exc_context = _ExceptionContext()
  340. dtd_parser = _FileReaderContext(file, exc_context, None)
  341. error_log = _ErrorLog()
  342. with error_log:
  343. c_dtd = dtd_parser._readDtd()
  344. exc_context._raise_if_stored()
  345. if c_dtd is NULL:
  346. raise DTDParseError("error parsing DTD", error_log)
  347. return c_dtd
  348. cdef DTD _dtdFactory(tree.xmlDtd* c_dtd):
  349. # do not run through DTD.__init__()!
  350. cdef DTD dtd
  351. if c_dtd is NULL:
  352. return None
  353. dtd = DTD.__new__(DTD)
  354. dtd._c_dtd = _copyDtd(c_dtd)
  355. _Validator.__init__(dtd)
  356. return dtd
  357. cdef tree.xmlDtd* _copyDtd(tree.xmlDtd* c_orig_dtd) except NULL:
  358. """
  359. Copy a DTD. libxml2 (currently) fails to set up the element->attributes
  360. links when copying DTDs, so we have to rebuild them here.
  361. """
  362. c_dtd = tree.xmlCopyDtd(c_orig_dtd)
  363. if not c_dtd:
  364. raise MemoryError
  365. cdef tree.xmlNode* c_node = c_dtd.children
  366. while c_node:
  367. if c_node.type == tree.XML_ATTRIBUTE_DECL:
  368. _linkDtdAttribute(c_dtd, <tree.xmlAttribute*>c_node)
  369. c_node = c_node.next
  370. return c_dtd
  371. cdef void _linkDtdAttribute(tree.xmlDtd* c_dtd, tree.xmlAttribute* c_attr) noexcept:
  372. """
  373. Create the link to the DTD attribute declaration from the corresponding
  374. element declaration.
  375. """
  376. c_elem = dtdvalid.xmlGetDtdElementDesc(c_dtd, c_attr.elem)
  377. if not c_elem:
  378. # no such element? something is wrong with the DTD ...
  379. return
  380. c_pos = c_elem.attributes
  381. if not c_pos:
  382. c_elem.attributes = c_attr
  383. c_attr.nexth = NULL
  384. return
  385. # libxml2 keeps namespace declarations first, and we need to make
  386. # sure we don't re-insert attributes that are already there
  387. if _isDtdNsDecl(c_attr):
  388. if not _isDtdNsDecl(c_pos):
  389. c_elem.attributes = c_attr
  390. c_attr.nexth = c_pos
  391. return
  392. while c_pos != c_attr and c_pos.nexth and _isDtdNsDecl(c_pos.nexth):
  393. c_pos = c_pos.nexth
  394. else:
  395. # append at end
  396. while c_pos != c_attr and c_pos.nexth:
  397. c_pos = c_pos.nexth
  398. if c_pos == c_attr:
  399. return
  400. c_attr.nexth = c_pos.nexth
  401. c_pos.nexth = c_attr
  402. cdef bint _isDtdNsDecl(tree.xmlAttribute* c_attr) noexcept:
  403. if cstring_h.strcmp(<const_char*>c_attr.name, "xmlns") == 0:
  404. return True
  405. if (c_attr.prefix is not NULL and
  406. cstring_h.strcmp(<const_char*>c_attr.prefix, "xmlns") == 0):
  407. return True
  408. return False