123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371 |
- # -*- coding: iso-8859-1 -*-
- """ A SAX2 driver for libxml2, on top of its XmlReader API
- USAGE
- # put this file (drv_libxml2.py) in PYTHONPATH
- import xml.sax
- reader = xml.sax.make_parser(["drv_libxml2"])
- # ...and the rest is standard python sax.
- CAVEATS
- - Lexical handlers are supported, except for start/endEntity
- (waiting for XmlReader.ResolveEntity) and start/endDTD
- - Error callbacks are not exactly synchronous, they tend
- to be invoked before the corresponding content callback,
- because the underlying reader interface parses
- data by chunks of 512 bytes
-
- TODO
- - search for TODO
- - some ErrorHandler events (warning)
- - some ContentHandler events (setDocumentLocator, skippedEntity)
- - EntityResolver (using libxml2.?)
- - DTDHandler (if/when libxml2 exposes such node types)
- - DeclHandler (if/when libxml2 exposes such node types)
- - property_xml_string?
- - feature_string_interning?
- - Incremental parser
- - additional performance tuning:
- - one might cache callbacks to avoid some name lookups
- - one might implement a smarter way to pass attributes to startElement
- (some kind of lazy evaluation?)
- - there might be room for improvement in start/endPrefixMapping
- - other?
- """
- __author__ = u"Stéphane Bidoul <sbi@skynet.be>"
- __version__ = "0.3"
- import codecs
- from types import StringType, UnicodeType
- StringTypes = (StringType,UnicodeType)
- from xml.sax._exceptions import *
- from xml.sax import xmlreader, saxutils
- from xml.sax.handler import \
- feature_namespaces, \
- feature_namespace_prefixes, \
- feature_string_interning, \
- feature_validation, \
- feature_external_ges, \
- feature_external_pes, \
- property_lexical_handler, \
- property_declaration_handler, \
- property_dom_node, \
- property_xml_string
# libxml2 returns strings as UTF8
_decoder = codecs.lookup("utf8")[1]
# Text type of the running interpreter (unicode on Python 2, str on Python 3).
_textType = type(u"")

def _d(s):
    """Decode a UTF-8 byte string coming from libxml2 into text.

    s -- a UTF-8 encoded byte string, an already decoded text string,
         or None.

    Returns None for None, the argument itself when it is already text,
    and the decoded text otherwise.
    """
    # Already-decoded text must not be fed to the byte decoder again:
    # that fails for non-ASCII unicode on Python 2 and for any str on
    # Python 3.
    if s is None or isinstance(s, _textType):
        return s
    return _decoder(s)[0]
# libxml2 is an optional third-party binding; report its absence through
# the standard SAX exception so xml.sax.make_parser() can try the next
# driver in its list.
try:
    import libxml2
except ImportError:
    # sys.exc_info() is used rather than binding the exception in the
    # except clause, because the "except X, e" spelling only parses on
    # Python 2 and this form parses on every version.
    import sys
    raise SAXReaderNotAvailable("libxml2 not available: "
                                "import error was: %s" % sys.exc_info()[1])
class Locator(xmlreader.Locator):
    """Expose a libxml2 xmlTextReaderLocator through the SAX Locator API."""

    def __init__(self, locator):
        # The wrapped object must provide LineNumber() and BaseURI().
        self.__wrapped = locator

    def getColumnNumber(self):
        """Column of the current event; always -1 in this adapter."""
        return -1

    def getLineNumber(self):
        """Line of the current event, as given by the wrapped locator."""
        wrapped = self.__wrapped
        return wrapped.LineNumber()

    def getPublicId(self):
        """Public identifier of the current event; always None here."""
        return None

    def getSystemId(self):
        """System identifier: the base URI given by the wrapped locator."""
        wrapped = self.__wrapped
        return wrapped.BaseURI()
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader implemented on top of the libxml2 text reader.

    parse() pulls nodes from a libxml2 text reader one at a time and
    turns each of them into the matching ContentHandler / LexicalHandler
    callback.  Errors collected by libxml2 are accumulated by
    _errorHandler() and forwarded to the SAX ErrorHandler after each
    Read() call.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator (None when nothing is pending)
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Callback registered on the libxml2 reader: queue one error.

        The error is stored as a (severity, SAXParseException) pair and
        dispatched later by _reportErrors().
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward the queued errors to the SAX ErrorHandler.

        fatal -- true when the reader stopped on an error; the last
                 queued non-warning error is then reported through
                 fatalError() instead of error().
        """
        # Detach the queue before dispatching: error handlers are allowed
        # to raise, and the pending errors must not survive into the next
        # parse() call when they do.
        errors = self.__errors
        self.__errors = None
        if not errors:
            return
        lastException = errors[-1][1]
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            else:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                if fatal and exception is lastException:
                    self._err_handler.fatalError(exception)
                else:
                    self._err_handler.error(exception)

    def _nsDeclPrefix(self, qname):
        """Classify an attribute qname as a namespace declaration or not.

        Returns (1, prefix) for "xmlns:prefix", (1, None) for the default
        namespace declaration "xmlns", and (0, None) for any other name,
        including names that merely start with the letters "xmlns".
        """
        if qname == "xmlns":
            return 1, None
        if qname.startswith("xmlns:"):
            return 1, qname[6:]
        return 0, None

    def parse(self, source):
        """Parse a document and emit SAX events.

        source -- a file name / URL given as a string, or anything
                  accepted by saxutils.prepare_input_source().

        Raises SAXException when the reader returns a node type this
        driver does not know; the error handler may raise as well.
        """
        self.__parsing = 1
        # drop anything left over from a previous, aborted parse
        self.__errors = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                # kept in a local so the buffer stays referenced for the
                # whole parse
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            isNsDecl, newPrefix = self._nsDeclPrefix(qname)
                            if isNsDecl:
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # entity events belong to the lexical handler; this
                    # reader itself has no startEntity/endEntity methods
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): availability of ResolveEntity on the
                    # libxml2 reader binding is not verified here -- confirm
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    pass # TODO (startDTD; how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass # TODO
                # Entity
                elif nodeType == 6:
                    pass # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass # TODO
                # Not handled on purpose: Attribute (2, never in this loop),
                # Document (9, not exposed), DocumentFragment (11, never
                # returned by XmlReader) and None (0).
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
            reader.Close()
        finally:
            self.__parsing = 0

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the SAX feature called name.

        Raises SAXNotRecognizedException for an unknown feature name.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1 # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set the SAX feature called name to state.

        Raises SAXNotSupportedException while a parse is running or when
        external general entities are switched off, and
        SAXNotRecognizedException for an unknown feature name.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the value of the SAX property called name.

        Raises SAXNotRecognizedException for an unknown property name.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set the SAX property called name to value.

        Only the lexical handler can be set.  The declaration handler
        raises SAXNotSupportedException and any other name raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: if/when libxml2 supports dtd events, store value in
            # self.__decl_handler instead of raising
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)
def create_parser():
    """Driver entry point: build and return a new LibXml2Reader."""
    parser = LibXml2Reader()
    return parser
|