import re

#
# NB: requires Elementtree (or interface-compatible) library
#

__all__ = ["shortcuts", "ns", "dexml"]

# Namespace shortcuts understood by ns(): prefix -> namespace URI.
shortcuts = {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "dc": "http://purl.org/dc/elements/1.1/",
    "atom": "http://www.w3.org/2005/Atom",
    "foaf": "http://xmlns.com/foaf/0.1/",
}

try:
    import web
except Exception:
    # web.py is optional.  Any failure to import it (missing or broken
    # install) falls back to a stand-in whose Storage() builds a plain dict.
    class web:
        @classmethod
        def Storage(cls, **params):
            return dict(**params)


def ns(tag, shortcut=""):
    """
    Wrap given tag into ET-compatible namespace description.

    Every known "<prefix>:" occurring in ``tag`` is replaced by the
    corresponding "{<namespace-uri>}".  If ``shortcut`` is a known prefix
    and the tag carries no namespace of its own, the namespace of
    ``shortcut`` is applied as the default one (placed after a leading
    "./", if any).  An unknown ``shortcut`` is ignored.

    Examples (each of the first two calls gives
    "{http://www.w3.org/2005/Atom}title"; the third one gives
    "somecustomtag" unchanged):

        absxml.ns("atom:title")
        absxml.ns("title", "atom")
        absxml.ns("somecustomtag", "unknownnamespace")
    """
    for prefix, uri in shortcuts.items():
        tag = tag.replace(prefix + ":", "{" + uri + "}")

    # An explicit prefix wins over the default namespace: stacking both
    # would yield "{default}{explicit}name", which never matches anything.
    if shortcut in shortcuts and "{" not in tag:
        default = "{" + shortcuts[shortcut] + "}"
        if tag.startswith("./"):
            # Keep the path marker in front of the namespaced name.
            return "./" + default + tag[2:]
        return default + tag
    return tag


def dexml_strip(t, dostrip=False):
    """
    Return ``t`` with everything up to and including its last ":" removed
    when ``dostrip`` is true; otherwise return ``t`` unchanged.
    """
    if dostrip:
        return re.sub(r".+:", "", t)
    return t


def dexml(E, ruleset, shortcut="", none=None, strip_shortcuts=False):
    """
    Convert Elementtree node into native pythonic dict() according to
    given ruleset.  Provides web.Storage() object if available.

    Ruleset is a dict( <query> = <rule> ).

    <query> is:
     - some XPath, started with "./", for example, "./foaf:nick"
     - tagname with namespace shortcut, like "foaf:nick"

    <rule> is:
     - "text"   -- text content of found xml-tag is stored
     - "xml"    -- found xml-tag (the element object itself) is stored
     - attrName -- value of given attribute of xml-tag found is stored
     - dict()   -- processed as an inlaid ruleset
     - list()   -- for each xml-tag found, all listed attributes are
                   stored as dict

    Input:
     - E               -- elementtree node for inspection
     - ruleset         -- ruleset for inspection, see description above
     - shortcut        -- default namespace shortcut to be applied to
                          queries with ns(); attribute names are resolved
                          without it
     - none            -- value for "None" substitution (if corresponding
                          query is not found)
     - strip_shortcuts -- strip namespace shortcuts from resulting output

    Output: a mapping with one key per query.  The value is ``none`` when
    nothing matched, the single result when exactly one xml-tag matched,
    and a list of results otherwise.

    Example:

        p = {
            "foaf:nick"        : "text",
            "foaf:name"        : "text",
            "foaf:dateOfBirth" : "text",
            "foaf:tagLine"     : "text",
            "foaf:img"         : "rdf:resource",
            "foaf:interest"    : ["dc:title", "rdf:resource"],
        }
        import elementtree.ElementTree as ET
        from openanything import fetch
        feed = ET.XML(fetch("http://freeformfactor.livejournal.com/data/foaf")["data"])
        person = feed.find(absxml.ns("foaf:Person"))
        print(absxml.dexml(person, p, none="", strip_shortcuts=True))
        print(absxml.dexml(person, p, none=""))
    """
    def strip(name):
        return dexml_strip(name, strip_shortcuts)

    O = web.Storage()
    for query, rule in ruleset.items():
        path = ns(query, shortcut)
        if not path.startswith("."):
            # Bare tag names are looked up relative to E.
            path = "./" + path

        found = []
        for node in E.findall(path):
            if rule == "text":
                found.append(node.text)
            elif rule == "xml":
                found.append(node)
            elif isinstance(rule, dict):
                found.append(
                    dexml(node, rule, shortcut, none=none,
                          strip_shortcuts=strip_shortcuts)
                )
            elif isinstance(rule, list):
                found.append(
                    dict((strip(attr), node.get(ns(attr))) for attr in rule)
                )
            else:
                found.append(node.get(ns(rule)))

        key = strip(query)
        if not found:
            O[key] = none
        elif len(found) == 1:
            # A single match is stored unwrapped, not as a one-item list.
            O[key] = found[0]
        else:
            O[key] = found
    return O