# Copyright (c) 2002, Daniel Krech, http://eikeon.com/ # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # # * Neither the name of Daniel Krech nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ """ from rdflib import RDF, RDFS from rdflib import URIRef, BNode, Literal from rdflib.exceptions import ParserError, Error from rdflib.syntax.xml_names import is_ncname from xml.sax.saxutils import handler, quoteattr, escape from urlparse import urljoin, urldefrag RDFNS = RDF.RDFNS # http://www.w3.org/TR/rdf-syntax-grammar/#eventterm-attribute-URI # A mapping from unqualified terms to there qualified version. UNQUALIFIED = {"about" : RDF.about, "ID" : RDF.ID, "type" : RDF.type, "resource": RDF.resource, "parseType": RDF.parseType} # http://www.w3.org/TR/rdf-syntax-grammar/#coreSyntaxTerms CORE_SYNTAX_TERMS = [RDF.RDF, RDF.ID, RDF.about, RDF.parseType, RDF.resource, RDF.nodeID, RDF.datatype] # http://www.w3.org/TR/rdf-syntax-grammar/#syntaxTerms SYNTAX_TERMS = CORE_SYNTAX_TERMS + [RDF.Description, RDF.li] # http://www.w3.org/TR/rdf-syntax-grammar/#oldTerms OLD_TERMS = [RDFNS["aboutEach"], RDFNS["aboutEachPrefix"], RDFNS["bagID"]] NODE_ELEMENT_EXCEPTIONS = CORE_SYNTAX_TERMS + [RDF.li,] + OLD_TERMS NODE_ELEMENT_ATTRIBUTES = [RDF.ID, RDF.nodeID, RDF.about] PROPERTY_ELEMENT_EXCEPTIONS = CORE_SYNTAX_TERMS + [RDF.Description,] + OLD_TERMS PROPERTY_ATTRIBUTE_EXCEPTIONS = CORE_SYNTAX_TERMS + [RDF.Description, RDF.li] + OLD_TERMS PROPERTY_ELEMENT_ATTRIBUTES = [RDF.ID, RDF.resource, RDF.nodeID] XMLNS = "http://www.w3.org/XML/1998/namespace" BASE = (XMLNS, "base") LANG = (XMLNS, "lang") class BagID(URIRef): __slots__ = ['li'] def __init__(self, val): super(URIRef, self).__init__(val) self.li = 0 def next_li(self): self.li += 1 return URIRef(RDFNS + "_%s" % self.li) class ElementHandler(object): __slots__ = ['start', 'char', 'end', 'li', 'id', 'base', 'subject', 'predicate', 'object', 'list', 'language', 'datatype', 'declared'] def __init__(self): self.start = None self.char = None self.end = None self.li = 0 self.id = None self.base = None self.subject = None self.object = None self.list = None self.language = "" self.datatype = "" self.declared = None def next_li(self): self.li += 1 return URIRef(RDFNS + "_%s" % self.li) class RDFXMLHandler(handler.ContentHandler): def __init__(self, store): self.store = store self.reset() def reset(self): document_element = ElementHandler() document_element.start = self.document_element_start document_element.end = lambda name, qname: None self.stack = [None, document_element,] self.ids = {} # remember IDs we have already seen self.bnode = {} self._ns_contexts = [{}] # contains uri -> prefix dicts self._current_context = self._ns_contexts[-1] # ContentHandler methods def setDocumentLocator(self, locator): self.locator = locator def startDocument(self): pass def startPrefixMapping(self, prefix, namespace): self._ns_contexts.append(self._current_context.copy()) self._current_context[namespace] = prefix self.store.bind(prefix, URIRef(namespace), override=False) def endPrefixMapping(self, prefix): self._current_context = self._ns_contexts[-1] del self._ns_contexts[-1] def startElementNS(self, name, qname, attrs): stack = self.stack stack.append(ElementHandler()) current = self.current parent = self.parent base = attrs.get(BASE, None) if base is not None: base, frag = urldefrag(base) else: if parent: base = parent.base if base is None: systemId = self.locator.getPublicId() or self.locator.getSystemId() if systemId: base, frag = urldefrag(systemId) current.base = base language = attrs.get(LANG, None) if language is None: if parent: language = parent.language else: language = '' current.language = language current.start(name, qname, attrs) def endElementNS(self, name, qname): self.current.end(name, qname) self.stack.pop() def characters(self, content): char = self.current.char if char: char(content) def ignorableWhitespace(self, content): pass def processingInstruction(self, target, data): pass def add_reified(self, sid, (s, p, o)): self.store.add((sid, RDF.type, RDF.Statement)) self.store.add((sid, RDF.subject, s)) self.store.add((sid, RDF.predicate, p)) self.store.add((sid, RDF.object, o)) def error(self, message): locator = self.locator info = "%s:%s:%s: " % (locator.getSystemId(), locator.getLineNumber(), locator.getColumnNumber()) raise ParserError(info + message) def get_current(self): return self.stack[-2] # Create a read only property called current so that self.current # give the current element handler. current = property(get_current) def get_next(self): return self.stack[-1] # Create a read only property that gives the element handler to be # used for the next element. next = property(get_next) def get_parent(self): return self.stack[-3] # Create a read only property that gives the current parent # element handler parent = property(get_parent) def absolutize(self, uri): result = urljoin(self.current.base, uri, allow_fragments=1) if uri and uri[-1]=="#" and result[-1]!="#": result = "%s#" % result return URIRef(result) def convert(self, name, qname, attrs): if name[0] is None: name = name[1] else: name = "".join(name) atts = {} for (n, v) in attrs.items(): #attrs._attrs.iteritems(): # if n[0] is None: att = n[1] else: att = "".join(n) if att.startswith(XMLNS) or att[0:3].lower()=="xml": pass elif att in UNQUALIFIED: #if not RDFNS[att] in atts: atts[RDFNS[att]] = v else: atts[att] = v return name, atts def document_element_start(self, name, qname, attrs): if name[0] and "".join(name) == RDF.RDF: next = self.next next.start = self.node_element_start next.end = self.node_element_end else: self.node_element_start(name, qname, attrs) #self.current.end = self.node_element_end # TODO... set end to something that sets start such that # another element will cause error def node_element_start(self, name, qname, attrs): name, atts = self.convert(name, qname, attrs) current = self.current absolutize = self.absolutize next = self.next next.start = self.property_element_start next.end = self.property_element_end if name in NODE_ELEMENT_EXCEPTIONS: self.error("Invalid node element URI: %s" % name) if RDF.ID in atts: if RDF.about in atts or RDF.nodeID in atts: self.error("Can have at most one of rdf:ID, rdf:about, and rdf:nodeID") id = atts[RDF.ID] if not is_ncname(id): self.error("rdf:ID value is not a valid NCName: %s" % id) subject = absolutize("#%s" % id) if subject in self.ids: self.error("two elements cannot use the same ID: '%s'" % subject) self.ids[subject] = 1 # IDs can only appear once within a document elif RDF.nodeID in atts: if RDF.ID in atts or RDF.about in atts: self.error("Can have at most one of rdf:ID, rdf:about, and rdf:nodeID") nodeID = atts[RDF.nodeID] if not is_ncname(nodeID): self.error("rdf:nodeID value is not a valid NCName: %s" % nodeID) if nodeID in self.bnode: subject = self.bnode[nodeID] else: subject = BNode() self.bnode[nodeID] = subject elif RDF.about in atts: if RDF.ID in atts or RDF.nodeID in atts: self.error("Can have at most one of rdf:ID, rdf:about, and rdf:nodeID") subject = absolutize(atts[RDF.about]) else: subject = BNode() if name!=RDF.Description: # S1 self.store.add((subject, RDF.type, absolutize(name))) language = current.language for att in atts: if not att.startswith(RDFNS): predicate = absolutize(att) try: object = Literal(atts[att], language) except Error, e: self.error(e.msg) elif att==RDF.type: #S2 predicate = RDF.type object = absolutize(atts[RDF.type]) elif att in NODE_ELEMENT_ATTRIBUTES: continue elif att in PROPERTY_ATTRIBUTE_EXCEPTIONS: #S3 self.error("Invalid property attribute URI: %s" % att) continue # for when error does not throw an exception else: predicate = absolutize(att) try: object = Literal(atts[att], language) except Error, e: self.error(e.msg) self.store.add((subject, predicate, object)) current.subject = subject def node_element_end(self, name, qname): self.parent.object = self.current.subject def property_element_start(self, name, qname, attrs): name, atts = self.convert(name, qname, attrs) current = self.current absolutize = self.absolutize next = self.next object = None current.list = None if not name.startswith(RDFNS): current.predicate = absolutize(name) elif name==RDF.li: current.predicate = current.next_li() elif name in PROPERTY_ELEMENT_EXCEPTIONS: self.error("Invalid property element URI: %s" % name) else: current.predicate = absolutize(name) id = atts.get(RDF.ID, None) if id is not None: if not is_ncname(id): self.error("rdf:ID value is not a value NCName: %s" % id) current.id = absolutize("#%s" % id) else: current.id = None resource = atts.get(RDF.resource, None) nodeID = atts.get(RDF.nodeID, None) parse_type = atts.get(RDF.parseType, None) if resource is not None and nodeID is not None: self.error("Property element cannot have both rdf:nodeID and rdf:resource") if resource is not None: object = absolutize(resource) next.start = self.node_element_start next.end = self.node_element_end elif nodeID is not None: if not is_ncname(nodeID): self.error("rdf:nodeID value is not a valid NCName: %s" % nodeID) if nodeID in self.bnode: object = self.bnode[nodeID] else: subject = BNode() self.bnode[nodeID] = subject object = subject next.start = self.node_element_start next.end = self.node_element_end else: if parse_type is not None: for att in atts: if att!=RDF.parseType and att!=RDF.ID: self.error("Property attr '%s' now allowed here" % att) if parse_type=="Resource": current.subject = object = BNode() current.char = self.property_element_char next.start = self.property_element_start next.end = self.property_element_end elif parse_type=="Collection": current.char = None object = current.list = RDF.nil #BNode()#self.parent.subject next.start = self.node_element_start next.end = self.list_node_element_end else: #if parse_type=="Literal": # All other values are treated as Literal # See: http://www.w3.org/TR/rdf-syntax-grammar/#parseTypeOtherPropertyElt object = Literal("", "", RDF.XMLLiteral) current.char = self.literal_element_char current.declared = {} next.start = self.literal_element_start next.char = self.literal_element_char next.end = self.literal_element_end current.object = object return else: object = None current.char = self.property_element_char next.start = self.node_element_start next.end = self.node_element_end datatype = atts.get(RDF.datatype, None) if datatype: datatype = URIRef(datatype) current.datatype = datatype language = current.language if datatype is not None: # TODO: check that there are no atts other than datatype and id pass else: for att in atts: if not att.startswith(RDFNS): predicate = absolutize(att) elif att in PROPERTY_ELEMENT_ATTRIBUTES: continue elif att in PROPERTY_ATTRIBUTE_EXCEPTIONS: self.error("""Invalid property attribute URI: %s""" % att) else: predicate = absolutize(att) if att==RDF.type: o = URIRef(atts[att]) else: o = Literal(atts[att], language, datatype) if object is None: object = BNode() self.store.add((object, predicate, o)) if object is None: object = Literal("", language, datatype) current.object = object def property_element_char(self, data): current = self.current if current.object is None: try: current.object = Literal(data, current.language, current.datatype) except Error, e: self.error(e.msg) else: if isinstance(current.object, Literal): try: current.object += data except Error, e: self.error(e.msg) def property_element_end(self, name, qname): current = self.current if self.next.end==self.list_node_element_end: if current.object!=RDF.nil: self.store.add((current.list, RDF.rest, RDF.nil)) if current.object is not None: self.store.add((self.parent.subject, current.predicate, current.object)) if current.id is not None: self.add_reified(current.id, (self.parent.subject, current.predicate, current.object)) current.subject = None def list_node_element_end(self, name, qname): current = self.current if self.parent.list==RDF.nil: list = BNode() # Removed between 20030123 and 20030905 #self.store.add((list, RDF.type, LIST)) self.parent.list = list self.store.add((self.parent.list, RDF.first, current.subject)) self.parent.object = list self.parent.char = None else: list = BNode() # Removed between 20030123 and 20030905 #self.store.add((list, RDF.type, LIST)) self.store.add((self.parent.list, RDF.rest, list)) self.store.add((list, RDF.first, current.subject)) self.parent.list = list def literal_element_start(self, name, qname, attrs): current = self.current self.next.start = self.literal_element_start self.next.char = self.literal_element_char self.next.end = self.literal_element_end current.declared = self.parent.declared.copy() if name[0]: prefix = self._current_context[name[0]] if prefix: current.object = "<%s:%s" % (prefix, name[1]) else: current.object = "<%s" % name[1] if not name[0] in current.declared: current.declared[name[0]] = prefix if prefix: current.object += (' xmlns:%s="%s"' % (prefix, name[0])) else: current.object += (' xmlns="%s"' % name[0]) else: current.object = "<%s" % name[1] for (name, value) in attrs.items(): if name[0]: if not name[0] in current.declared: current.declared[name[0]] = self._current_context[name[0]] name = current.declared[name[0]] + ":" + name[1] else: name = name[1] current.object += (' %s=%s' % (name, quoteattr(value))) current.object += ">" def literal_element_char(self, data): self.current.object += escape(data) def literal_element_end(self, name, qname): if name[0]: prefix = self._current_context[name[0]] if prefix: end = u"</%s:%s>" % (prefix, name[1]) else: end = u"</%s>" % name[1] else: end = u"</%s>" % name[1] self.parent.object += self.current.object + end