1 # -*- coding: utf-8 -*-
2 # CurlyTx Atom feed parser
3 # Copyright (C) 2011 Christian Weiske <cweiske@cweiske.de>
5 from twisted.web.client import getPage
6 from xml.etree.cElementTree import fromstring
""" Simple XML parser that extracts pages from an Atom feed """
10 ns = "{http://www.w3.org/2005/Atom}"
def __init__(self, url, callback):
    """ Fetch the atom feed at url and pass its pages to callback

    Parsed pages are sent back to callback by parse()
    """
    # parse() receives the downloaded feed body plus callback as extra
    # argument; the errback is chained last so that it sees failures of
    # the download as well as exceptions raised inside parse().
    getPage(url).addCallback(self.parse, callback).addErrback(self.fail)
def parse(self, data, callback):
    """ Parse atom feed data into pages list and run callback

    data is the feed XML as delivered by the download.
    callback is called once with a list of {"title": ..., "url": ...}
    dictionaries, one for each entry that has both a non-empty title
    and a link accepted by bestLink(). Other entries are left out.
    """
    xml = fromstring(data)
    pages = []
    for entry in xml.findall("{0}entry".format(self.ns)):
        titleE = entry.find("{0}title".format(self.ns))
        url = self.bestLink(entry.findall("{0}link".format(self.ns)))
        # Compare the element with "is not None": an element without
        # children may evaluate as false. ElementTree reports an empty
        # <title/> as text None (never ""), so test the text for
        # truthiness to really skip untitled entries.
        if titleE is not None and titleE.text and url is not None:
            pages.append({"title": titleE.text, "url": url})
    callback(pages)
def bestLink(self, list):
    """ Fetch the best matching link from an atom feed entry

    list contains the link elements of one entry.
    Only "alternate" links are considered. A link without rel attribute
    counts as "alternate" too, as RFC 4287 requires.
    Returns the href of the link with the highest level(); when several
    links share that level, the last one wins. Returns None when no link
    qualifies.
    """
    foundLevel = None
    foundHref = None
    for link in list:
        # get() returns None (not "") when the attribute is missing, so
        # both values have to be accepted next to "alternate".
        if link.get("rel") not in (None, "", "alternate"):
            continue
        level = self.level(link)
        if foundLevel is not None and foundLevel > level:
            continue
        foundLevel = level
        foundHref = link.get("href")
    return foundHref
48 def level(self, link):
49 """ Determines the level of a link
51 "text/plain" type links are best, links without type are second.
52 All others have the lowest level 1.
54 type = link.get("type")
55 if type == "text/plain":