src/AtomFeed.py

   1 # -*- coding: utf-8 -*-
   2 # CurlyTx Atom feed parser
   3 # Copyright (C) 2011 Christian Weiske <cweiske@cweiske.de>
   4
   5 from twisted.web.client import getPage
   6 from xml.etree.cElementTree import fromstring
   7
   8 class AtomFeed:
   9     """ Simple XML parser that extracts pages from a atom feed """
  10     ns = "{http://www.w3.org/2005/Atom}"
  11     def __init__(self, url, callback):
  12         """ Fetches the URL
  13
  14         Parsed pages are sent back to callback by parse()
  15         """
  16         getPage(url).addCallback(self.parse, callback).addErrback(self.fail)
  17
  18
  19     def fail(self, msg):
  20         print("CurlyTx", msg)
  21
  22     def parse(self, data, callback):
  23         """ Parse atom feed data into pages list and run callback """
  24         xml = fromstring(data)
  25         pages = []
  26         for entry in xml.findall("{0}entry".format(self.ns)):
  27             titleE = entry.find("{0}title".format(self.ns))
  28             url   = self.bestLink(entry.findall("{0}link".format(self.ns)))
  29             if titleE != None and titleE.text != "" and url != None:
  30                 pages.append({"title": titleE.text, "url": url})
  31
  32         callback(pages)
  33
  34     def bestLink(self, list):
  35         """ Fetch the best matching link from an atom feed entry """
  36         foundLevel = -1
  37         foundHref = None
  38         for link in list:
  39             if link.get("rel") != "alternate" and link.get("rel") != "":
  40                 continue
  41             level = self.level(link)
  42             if foundLevel > level:
  43                 continue
  44             foundLevel = level
  45             foundHref = link.get("href")
  46         return foundHref
  47
  48     def level(self, link):
  49         """ Determines the level of a link
  50
  51         "text/plain" type links are best, links without type are second.
  52         All others have the lowest level 1.
  53         """
  54         type = link.get("type")
  55         if type == "text/plain":
  56             return 3
  57         elif type == "":
  58             return 2
  59         return 1