""" A Simple RSS News Retriever: Returns the content of an rss2 feed, parsed and pretty. Usage: [[RSSget(http://www.example.com/feed.xml)]] This edition: * caches fetched feeds to /tmp/tracrss * returns the feed as a
Customisation: * RESULTS_FULL = 1; < number of results for which descriptions are displayed * RESULTS_TOTAL = 5; < total number of results, titles of posts are displayed * CACHE_INTERVAL = 1500 < time till feed is rechecked. Known Issues: Doesn't parse atom or rss or rdf. This is a one trick pony """ import urllib import time import string import os import httplib import urllib2 import cgi import StringIO from xml.dom import minidom from trac.util import escape CACHE_DIR = "/tmp"; CACHE_ID = "tracrss" CACHE_INTERVAL = 7500 News = "" # output RESULTS_FULL = 1; RESULTS_TOTAL = 5; # Flow # 1. check cache # 2. if there is a hit, make sure its fresh # 3. if cached obj fails freshness check, fetch remote # 4. if remote fails, return stale object, or error ## PART ONE: check the cache def cache_lookup(url): # generate a positive hash of the url cache_id = abs(hash(url)) # print cache_id # look through the cache dir for matches cached_files = os.listdir(CACHE_DIR + "/" + CACHE_ID) for filename in cached_files: # print cached_files cached_file = string.split(filename, sep=CACHE_ID)[0] #check the hash if (cache_id == int(cached_file)): return filename ## PART TWO: check for freshness def freshness_check(filename, interval): olddate = string.split(filename, sep=CACHE_ID)[1] time_elapsed = time.time() - float(olddate) if (time_elapsed > interval): return False else: return True ## PART THREE: if the cached one fails the freshness test ## make a new filename for the new get. 
def create_filename(url):
    """Return a unique but parseable cache file name for *url*.

    The name consists of:
        hash of url
        cache id
        timestamp of creation
    The url hash and the timestamp can be recovered by splitting the name
    on CACHE_ID.
    """
    return str(abs(hash(url))) + CACHE_ID + str(time.time())


## burn the old one
def remove_old_cache(url):
    """Delete the cache file that cache_lookup() finds for *url*.

    Does nothing when no cache file exists for the url; previously the
    missing file name (None) was concatenated into the path and raised a
    TypeError.
    """
    filename = cache_lookup(url)
    if not filename:
        return
    os.remove(os.path.join(CACHE_DIR, CACHE_ID, filename))


## hell, burn all of 'em
def clear_cache(url):
    """Delete every cache file whose name carries the hash of *url*.

    Files in the cache directory whose prefix is not an integer (i.e. files
    that are not ours) are left alone instead of raising ValueError.
    """
    # generate a positive hash of the url
    cache_id = abs(hash(url))
    cache_root = os.path.join(CACHE_DIR, CACHE_ID)
    # look through the cache dir for matches
    for filename in os.listdir(cache_root):
        try:
            file_id = int(filename.split(CACHE_ID)[0])
        except ValueError:
            continue
        # check the hash
        if file_id == cache_id:
            os.remove(os.path.join(cache_root, filename))


# a useful tool
def _mkdir(newdir):
    """works the way a good mkdir should :)
        - already exists, silently complete
        - regular file in the way, raise an exception
        - parent directory(ies) does not exist, make them as well
    """
    if os.path.isdir(newdir):
        return
    if os.path.isfile(newdir):
        raise OSError("a file with the same name as the desired "
                      "dir, '%s', already exists." % newdir)
    head, tail = os.path.split(newdir)
    if head and not os.path.isdir(head):
        _mkdir(head)
    if tail:
        try:
            os.mkdir(newdir)
        except OSError:
            # Another process may have created the directory between the
            # isdir() check above and this call; that still counts as
            # "already exists, silently complete".
            if not os.path.isdir(newdir):
                raise


#just make sure it's there, every time..
_mkdir(os.path.join(CACHE_DIR, CACHE_ID))


## write stuff to a new cache
def write_to_cache(url, contents):
    """Replace whatever is cached for *url* with *contents*.

    Existing cache files for the url are removed first, then a new file
    named by create_filename() is written inside the cache directory.
    """
    clear_cache(url)
    # generate the filename
    filename = create_filename(url)
    # make sure the directory's there
    cache_root = os.path.join(CACHE_DIR, CACHE_ID)
    _mkdir(cache_root)
    # open the right file
    cachefile = open(os.path.join(cache_root, filename), 'w')
    try:
        # cram in the contents
        cachefile.write(contents)
    finally:
        # The original said `cachefile.close` (no call), so the file was
        # never explicitly closed or flushed.
        cachefile.close()


# fetch the feed over the network
def remote_url_get(url):
    """Fetch *url*, store the body in the cache and return it.

    The request is sent with the User-Agent 'TracRSS'.  Errors raised by
    urllib2 while opening or reading the url propagate to the caller.

    NOTE(review): the url comes straight from the macro argument and is
    opened with the default urllib2 opener, which accepts more than http
    (e.g. file:). Consider restricting the allowed schemes.
    """
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'TracRSS')
    response = urllib2.build_opener().open(request)
    try:
        feeddata = response.read()
    finally:
        response.close()
    write_to_cache(url, feeddata)
    return feeddata


def local_file_read(filename):
    """Return the contents of the cache file *filename* (a bare file name)."""
    cachefile = open(os.path.join(CACHE_DIR, CACHE_ID, filename), 'r')
    try:
        return cachefile.read()
    finally:
        cachefile.close()


def _item_text(item, tag):
    """Return the escaped text of the first *tag* element inside *item*.

    Returns "" when the element is missing, empty, or does not start with a
    text node, so that callers always get exactly one value per item.
    """
    nodes = item.getElementsByTagName(tag)
    if not nodes:
        return ""
    data = getattr(nodes[0].firstChild, 'data', None)
    if data is None:
        return ""
    return cgi.escape(data)


#parse into xml
def parse_file(feeddata):
    """Parse the rss2 document *feeddata* and return the rendered snippet.

    The first RESULTS_FULL items are rendered with title and description,
    the following ones up to RESULTS_TOTAL with their title only.  Feeds
    with fewer items than those limits are rendered as far as they go
    (this used to raise IndexError), and an item without a title or a
    description contributes an empty string instead of shifting the titles
    and descriptions of later items against each other.

    Raises whatever minidom raises when *feeddata* is not well-formed XML.

    NOTE(review): in the copy of this file that was reviewed, the string
    literals below contain nothing but line breaks, and `links` is collected
    without being used.  The original markup (most likely HTML tags wrapping
    the titles and linking to `links[i]`) appears to have been stripped;
    restore it from the upstream source.  The literals are reproduced here
    exactly as they were visible.
    """
    titles = []
    links = []
    words = []
    xmldoc = minidom.parseString(feeddata)
    for item in xmldoc.getElementsByTagName('item'):
        titles.append(_item_text(item, "title"))
        links.append(_item_text(item, "link"))
        words.append(_item_text(item, "description"))

    # never index past the items the feed actually contains
    shown_full = min(RESULTS_FULL, len(titles))
    shown_total = min(RESULTS_TOTAL, len(titles))

    parts = ["\n\n"]
    for i in range(shown_full):
        parts.append("\n" + titles[i] + "\n\n")
        parts.append("\n\n" + words[i] + "\n\n\n")
    for i in range(shown_full, shown_total):
        parts.append("\n" + titles[i] + "\n\n")
    parts.append("\n\n")
    return "".join(parts)


# the master logic.
def rss_get_url(url):
    """Return the rendered feed for *url*, going through the cache.

    1. check cache
    2. if there is a hit and it is fresh, render the cached copy
    3. otherwise fetch the remote feed (which also refreshes the cache)
    4. if the remote fetch fails and a stale cached copy still exists,
       render that; without one the fetch error propagates
    """
    cache_file = cache_lookup(url)
    if cache_file and freshness_check(cache_file, CACHE_INTERVAL):
        return parse_file(local_file_read(cache_file))

    try:
        remote_rss = remote_url_get(url)
    except (IOError, httplib.HTTPException):
        # urllib2.URLError derives from IOError.  Only fall back when the
        # stale file is still on disk; otherwise re-raise the fetch error.
        stale_path = None
        if cache_file:
            stale_path = os.path.join(CACHE_DIR, CACHE_ID, cache_file)
        if stale_path is None or not os.path.isfile(stale_path):
            raise
        return parse_file(local_file_read(cache_file))
    return parse_file(remote_rss)


def execute(hdf, txt, env):
    """Trac macro entry point: render the feed whose url is given in *txt*.

    *hdf* and *env* are not used.  *txt* is the macro argument; it is empty
    when the macro is called without parenthesis, in which case a short
    usage hint is returned instead of failing inside the url fetch.
    """
    url = (txt or "").strip()
    if not url:
        return ("RSSget: no feed url given. "
                "Usage: [[RSSget(http://www.example.com/feed.xml)]]")
    return rss_get_url(url)