drv_libxml2.py 15 KB


  1. # -*- coding: iso-8859-1 -*-
  2. """ A SAX2 driver for libxml2, on top of its XmlReader API
  3. USAGE
  4. # put this file (drv_libxml2.py) in PYTHONPATH
  5. import xml.sax
  6. reader = xml.sax.make_parser(["drv_libxml2"])
  7. # ...and the rest is standard python sax.
  8. CAVEATS
  9. - Lexical handlers are supported, except for start/endEntity
  10. (waiting for XmlReader.ResolveEntity) and start/endDTD
  11. - Error callbacks are not exactly synchronous, they tend
  12. to be invoked before the corresponding content callback,
  13. because the underlying reader interface parses
  14. data by chunks of 512 bytes
  15. TODO
  16. - search for TODO
  17. - some ErrorHandler events (warning)
  18. - some ContentHandler events (setDocumentLocator, skippedEntity)
  19. - EntityResolver (using libxml2.?)
  20. - DTDHandler (if/when libxml2 exposes such node types)
  21. - DeclHandler (if/when libxml2 exposes such node types)
  22. - property_xml_string?
  23. - feature_string_interning?
  24. - Incremental parser
  25. - additional performance tuning:
  26. - one might cache callbacks to avoid some name lookups
  27. - one might implement a smarter way to pass attributes to startElement
  28. (some kind of lazy evaluation?)
  29. - there might be room for improvement in start/endPrefixMapping
  30. - other?
  31. """
  32. __author__ = u"Stéphane Bidoul <sbi@skynet.be>"
  33. __version__ = "0.3"
  34. import codecs
  35. from types import StringType, UnicodeType
  36. StringTypes = (StringType,UnicodeType)
  37. from xml.sax._exceptions import *
  38. from xml.sax import xmlreader, saxutils
  39. from xml.sax.handler import \
  40. feature_namespaces, \
  41. feature_namespace_prefixes, \
  42. feature_string_interning, \
  43. feature_validation, \
  44. feature_external_ges, \
  45. feature_external_pes, \
  46. property_lexical_handler, \
  47. property_declaration_handler, \
  48. property_dom_node, \
  49. property_xml_string
  50. # libxml2 returns strings as UTF8
  51. _decoder = codecs.lookup("utf8")[1]
  52. def _d(s):
  53. if s is None:
  54. return s
  55. else:
  56. return _decoder(s)[0]
  57. try:
  58. import libxml2
  59. except ImportError, e:
  60. raise SAXReaderNotAvailable("libxml2 not available: " \
  61. "import error was: %s" % e)
  62. class Locator(xmlreader.Locator):
  63. """SAX Locator adapter for libxml2.xmlTextReaderLocator"""
  64. def __init__(self,locator):
  65. self.__locator = locator
  66. def getColumnNumber(self):
  67. "Return the column number where the current event ends."
  68. return -1
  69. def getLineNumber(self):
  70. "Return the line number where the current event ends."
  71. return self.__locator.LineNumber()
  72. def getPublicId(self):
  73. "Return the public identifier for the current event."
  74. return None
  75. def getSystemId(self):
  76. "Return the system identifier for the current event."
  77. return self.__locator.BaseURI()
  78. class LibXml2Reader(xmlreader.XMLReader):
  79. def __init__(self):
  80. xmlreader.XMLReader.__init__(self)
  81. # features
  82. self.__ns = 0
  83. self.__nspfx = 0
  84. self.__validate = 0
  85. self.__extparams = 1
  86. # parsing flag
  87. self.__parsing = 0
  88. # additional handlers
  89. self.__lex_handler = None
  90. self.__decl_handler = None
  91. # error messages accumulator
  92. self.__errors = None
  93. def _errorHandler(self,arg,msg,severity,locator):
  94. if self.__errors is None:
  95. self.__errors = []
  96. self.__errors.append((severity,
  97. SAXParseException(msg,None,
  98. Locator(locator))))
  99. def _reportErrors(self,fatal):
  100. for severity,exception in self.__errors:
  101. if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
  102. libxml2.PARSER_SEVERITY_WARNING):
  103. self._err_handler.warning(exception)
  104. else:
  105. # when fatal is set, the parse will stop;
  106. # we consider that the last error reported
  107. # is the fatal one.
  108. if fatal and exception is self.__errors[-1][1]:
  109. self._err_handler.fatalError(exception)
  110. else:
  111. self._err_handler.error(exception)
  112. self.__errors = None
  113. def parse(self, source):
  114. self.__parsing = 1
  115. try:
  116. # prepare source and create reader
  117. if type(source) in StringTypes:
  118. reader = libxml2.newTextReaderFilename(source)
  119. else:
  120. source = saxutils.prepare_input_source(source)
  121. input = libxml2.inputBuffer(source.getByteStream())
  122. reader = input.newTextReader(source.getSystemId())
  123. reader.SetErrorHandler(self._errorHandler,None)
  124. # configure reader
  125. if self.__extparams:
  126. reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
  127. reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
  128. reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
  129. reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
  130. else:
  131. reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
  132. # we reuse attribute maps (for a slight performance gain)
  133. if self.__ns:
  134. attributesNSImpl = xmlreader.AttributesNSImpl({},{})
  135. else:
  136. attributesImpl = xmlreader.AttributesImpl({})
  137. # prefixes to pop (for endPrefixMapping)
  138. prefixes = []
  139. # start loop
  140. self._cont_handler.startDocument()
  141. while 1:
  142. r = reader.Read()
  143. # check for errors
  144. if r == 1:
  145. if not self.__errors is None:
  146. self._reportErrors(0)
  147. elif r == 0:
  148. if not self.__errors is None:
  149. self._reportErrors(0)
  150. break # end of parse
  151. else:
  152. if not self.__errors is None:
  153. self._reportErrors(1)
  154. else:
  155. self._err_handler.fatalError(\
  156. SAXException("Read failed (no details available)"))
  157. break # fatal parse error
  158. # get node type
  159. nodeType = reader.NodeType()
  160. # Element
  161. if nodeType == 1:
  162. if self.__ns:
  163. eltName = (_d(reader.NamespaceUri()),\
  164. _d(reader.LocalName()))
  165. eltQName = _d(reader.Name())
  166. attributesNSImpl._attrs = attrs = {}
  167. attributesNSImpl._qnames = qnames = {}
  168. newPrefixes = []
  169. while reader.MoveToNextAttribute():
  170. qname = _d(reader.Name())
  171. value = _d(reader.Value())
  172. if qname.startswith("xmlns"):
  173. if len(qname) > 5:
  174. newPrefix = qname[6:]
  175. else:
  176. newPrefix = None
  177. newPrefixes.append(newPrefix)
  178. self._cont_handler.startPrefixMapping(\
  179. newPrefix,value)
  180. if not self.__nspfx:
  181. continue # don't report xmlns attribute
  182. attName = (_d(reader.NamespaceUri()),
  183. _d(reader.LocalName()))
  184. qnames[attName] = qname
  185. attrs[attName] = value
  186. reader.MoveToElement()
  187. self._cont_handler.startElementNS( \
  188. eltName,eltQName,attributesNSImpl)
  189. if reader.IsEmptyElement():
  190. self._cont_handler.endElementNS(eltName,eltQName)
  191. for newPrefix in newPrefixes:
  192. self._cont_handler.endPrefixMapping(newPrefix)
  193. else:
  194. prefixes.append(newPrefixes)
  195. else:
  196. eltName = _d(reader.Name())
  197. attributesImpl._attrs = attrs = {}
  198. while reader.MoveToNextAttribute():
  199. attName = _d(reader.Name())
  200. attrs[attName] = _d(reader.Value())
  201. reader.MoveToElement()
  202. self._cont_handler.startElement( \
  203. eltName,attributesImpl)
  204. if reader.IsEmptyElement():
  205. self._cont_handler.endElement(eltName)
  206. # EndElement
  207. elif nodeType == 15:
  208. if self.__ns:
  209. self._cont_handler.endElementNS( \
  210. (_d(reader.NamespaceUri()),_d(reader.LocalName())),
  211. _d(reader.Name()))
  212. for prefix in prefixes.pop():
  213. self._cont_handler.endPrefixMapping(prefix)
  214. else:
  215. self._cont_handler.endElement(_d(reader.Name()))
  216. # Text
  217. elif nodeType == 3:
  218. self._cont_handler.characters(_d(reader.Value()))
  219. # Whitespace
  220. elif nodeType == 13:
  221. self._cont_handler.ignorableWhitespace(_d(reader.Value()))
  222. # SignificantWhitespace
  223. elif nodeType == 14:
  224. self._cont_handler.characters(_d(reader.Value()))
  225. # CDATA
  226. elif nodeType == 4:
  227. if not self.__lex_handler is None:
  228. self.__lex_handler.startCDATA()
  229. self._cont_handler.characters(_d(reader.Value()))
  230. if not self.__lex_handler is None:
  231. self.__lex_handler.endCDATA()
  232. # EntityReference
  233. elif nodeType == 5:
  234. if not self.__lex_handler is None:
  235. self.startEntity(_d(reader.Name()))
  236. reader.ResolveEntity()
  237. # EndEntity
  238. elif nodeType == 16:
  239. if not self.__lex_handler is None:
  240. self.endEntity(_d(reader.Name()))
  241. # ProcessingInstruction
  242. elif nodeType == 7:
  243. self._cont_handler.processingInstruction( \
  244. _d(reader.Name()),_d(reader.Value()))
  245. # Comment
  246. elif nodeType == 8:
  247. if not self.__lex_handler is None:
  248. self.__lex_handler.comment(_d(reader.Value()))
  249. # DocumentType
  250. elif nodeType == 10:
  251. #if not self.__lex_handler is None:
  252. # self.__lex_handler.startDTD()
  253. pass # TODO (how to detect endDTD? on first non-dtd event?)
  254. # XmlDeclaration
  255. elif nodeType == 17:
  256. pass # TODO
  257. # Entity
  258. elif nodeType == 6:
  259. pass # TODO (entity decl)
  260. # Notation (decl)
  261. elif nodeType == 12:
  262. pass # TODO
  263. # Attribute (never in this loop)
  264. #elif nodeType == 2:
  265. # pass
  266. # Document (not exposed)
  267. #elif nodeType == 9:
  268. # pass
  269. # DocumentFragment (never returned by XmlReader)
  270. #elif nodeType == 11:
  271. # pass
  272. # None
  273. #elif nodeType == 0:
  274. # pass
  275. # -
  276. else:
  277. raise SAXException("Unexpected node type %d" % nodeType)
  278. if r == 0:
  279. self._cont_handler.endDocument()
  280. reader.Close()
  281. finally:
  282. self.__parsing = 0
  283. def setDTDHandler(self, handler):
  284. # TODO (when supported, the inherited method works just fine)
  285. raise SAXNotSupportedException("DTDHandler not supported")
  286. def setEntityResolver(self, resolver):
  287. # TODO (when supported, the inherited method works just fine)
  288. raise SAXNotSupportedException("EntityResolver not supported")
  289. def getFeature(self, name):
  290. if name == feature_namespaces:
  291. return self.__ns
  292. elif name == feature_namespace_prefixes:
  293. return self.__nspfx
  294. elif name == feature_validation:
  295. return self.__validate
  296. elif name == feature_external_ges:
  297. return 1 # TODO (does that relate to PARSER_LOADDTD)?
  298. elif name == feature_external_pes:
  299. return self.__extparams
  300. else:
  301. raise SAXNotRecognizedException("Feature '%s' not recognized" % \
  302. name)
  303. def setFeature(self, name, state):
  304. if self.__parsing:
  305. raise SAXNotSupportedException("Cannot set feature %s " \
  306. "while parsing" % name)
  307. if name == feature_namespaces:
  308. self.__ns = state
  309. elif name == feature_namespace_prefixes:
  310. self.__nspfx = state
  311. elif name == feature_validation:
  312. self.__validate = state
  313. elif name == feature_external_ges:
  314. if state == 0:
  315. # TODO (does that relate to PARSER_LOADDTD)?
  316. raise SAXNotSupportedException("Feature '%s' not supported" % \
  317. name)
  318. elif name == feature_external_pes:
  319. self.__extparams = state
  320. else:
  321. raise SAXNotRecognizedException("Feature '%s' not recognized" % \
  322. name)
  323. def getProperty(self, name):
  324. if name == property_lexical_handler:
  325. return self.__lex_handler
  326. elif name == property_declaration_handler:
  327. return self.__decl_handler
  328. else:
  329. raise SAXNotRecognizedException("Property '%s' not recognized" % \
  330. name)
  331. def setProperty(self, name, value):
  332. if name == property_lexical_handler:
  333. self.__lex_handler = value
  334. elif name == property_declaration_handler:
  335. # TODO: remove if/when libxml2 supports dtd events
  336. raise SAXNotSupportedException("Property '%s' not supported" % \
  337. name)
  338. self.__decl_handler = value
  339. else:
  340. raise SAXNotRecognizedException("Property '%s' not recognized" % \
  341. name)
  342. def create_parser():
  343. return LibXml2Reader()