#!/usr/bin/env python
# RSS->Zephyr parser
# Waseem Daher
# First release: Mar 5, 2005
# Slight revision: Apr 6, 2005
#   (try to write each line of
#    sent zephyrs by hand, that way
#    when we get a bad one, we don't
#    clear the list)
# Feb 17, 2008:
#   - use .guid when it exists
#   - use w3m for output
# June 21, 2008:
#   - Rewrite to use a sql backend
#   - write-ahead logging so we don't re-send buggy feeds
#   - etag support
#   - set a reasonable user-agent
# Eventual todo:
#   - last-modified-time in addition to etags?
#   - web interface (??)
# Questions, comments, concerns?
# Email sipb-zephyrss@mit.edu

# Run-time switches (0 = off, non-zero = on).
FORCE_SCHEDULE = 0       # poll every feed now, ignoring its update interval
DEBUG = 0                # print progress messages via debug()
DONT_ZEPHYR = 0          # build messages but do not actually send them
SEND_TO_WDAHERTEST = 0   # send everything to class 'wdaher-test' instead

import os, re, time, sys
import subprocess
sys.path.append('/mit/wdaher/web_scripts/puzzles/')
import secrets
import MySQLdb
import feedparser
feedparser.USER_AGENT = "rssZephyr/2 +http://www.mit.edu/~wdaher/rssZephyr.html"


def debug(str):
    """Print `str` when DEBUG is set; a failure to print is ignored."""
    if DEBUG:
        try:
            print(str)
        except Exception:
            # Debug output is best-effort (e.g. unprintable characters).
            pass


def sendZephyr(msg, classname, instance, sig):
    """Render `msg` as HTML through w3m and zwrite the result.

    msg       -- message text; it is fed to w3m as text/html
    classname -- zephyr class (overridden when SEND_TO_WDAHERTEST is set)
    instance  -- zephyr instance
    sig       -- zephyr signature

    Sending is best-effort: any failure is reported through debug() only
    and never propagates to the caller.
    """
    if SEND_TO_WDAHERTEST:
        classname = 'wdaher-test'
    try:
        # Characters outside Latin-1 become HTML character references
        # instead of raising, so a single odd character no longer causes
        # the whole message to be dropped.
        text = msg.encode('iso-8859-1', 'xmlcharrefreplace')
        debug(text)
        if DONT_ZEPHYR:
            return
        # Equivalent of "w3m ... | zwrite ...", but with argument lists so
        # that class/instance/signature values are never parsed by a shell.
        w3m = subprocess.Popen(
            ['w3m', '-cols', '70', '-dump', '-T', 'text/html'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        zwrite = subprocess.Popen(
            ['zwrite', '-q', '-d', '-c', classname, '-i', instance,
             '-O', 'auto', '-s', sig],
            stdin=w3m.stdout)
        # Drop our copy of the pipe so zwrite sees EOF when w3m exits.
        w3m.stdout.close()
        w3m.stdin.write(text)
        w3m.stdin.close()
        w3m.wait()
        zwrite.wait()
    except Exception:
        debug("Error zephyring out some text :(")


con = MySQLdb.connect(host = 'sql.mit.edu',
                      user = 'wdaher',
                      passwd = secrets.PASSWD,
                      db = 'wdaher+rss')
cur = con.cursor()

try:
    # Read the list of feeds to poll.
    feeds = "/mit/wdaher/Public/rssZephyr/feedsToPoll/feeds"
    feedFile = open(feeds, 'r')
    try:
        feedList = feedFile.readlines()
    finally:
        feedFile.close()

    # Each usable line is "url,classname,instancename,updateInterval".
    # Lines starting with '#' and blank lines are ignored; a line starting
    # with 'STOP' aborts the whole run.
    toHandle = []
    for feedLine in feedList:
        if feedLine.startswith('STOP'):
            print('Emergency stop invoked, quitting')
            sys.exit(1)
        if feedLine.startswith('#') or not feedLine.strip():
            continue

        tokens = feedLine.rstrip().split(',')
        updateInt = None
        if len(tokens) == 4:
            try:
                updateInt = int(tokens[3])
            except ValueError:
                updateInt = None
        # The interval is used as a modulus below, so it must be positive.
        if updateInt is None or updateInt <= 0:
            print('Invalid feed descriptor: %s' % feedLine)
            print("Make sure there are no spaces or quotation marks?")
            continue

        url, classname, instancename = tokens[:3]
        toHandle.append( (url, (classname, instancename), updateInt) )

    ## Make sure the database state is synchronized with the file state
    ## for the purposes of recording etags
    for url, target, interval in toHandle:
        cur.execute('select count(*) from feeds_and_tags where feed_url=%s',
                    (url,))
        if cur.fetchall()[0][0] == 0:
            cur.execute('insert into feeds_and_tags (feed_url, etag) values(%s,%s)',
                        (url, None))
            con.commit()

    ## Actually do your thing
    for url, target, interval in toHandle:
        # A feed is due when the current minute count is a multiple of its
        # interval.
        if int(time.time() / 60) % interval != 0 and not FORCE_SCHEDULE:
            # Not your time
            debug("Not time for %s, skipping" % url)
            continue

        # Get the previous e-tag/last-modified-time this guy had, if any
        cur.execute("select etag from feeds_and_tags where feed_url=%s",
                    (url, ))
        taglist = cur.fetchall()
        etag = None
        if len(taglist):
            etag = taglist[0][0]
        debug("Last tag was: %s for %s" % (etag, url))

        # Parse
        f = feedparser.parse(url, etag=etag)
        newetag = None
        if hasattr(f, 'etag'):
            newetag = f.etag

        # Insert new etag into the database.
        cur.execute("update feeds_and_tags set etag=%s where feed_url=%s",
                    (newetag, url))
        con.commit()

        # No status attribute on the result: nothing to process this round.
        if not hasattr(f, 'status'):
            continue
        # 304: not modified since the stored etag.
        if f.status == 304:
            continue
        if f.status != 200:
            sendZephyr("HELP, this isn't working: %s returned a code of %i"
                       % (url, f.status),
                       'wdaher-test', 'rssZephyr', 'SOMETHING IS BROKEN')

        for entry in f.entries:
            # Identify the entry by its guid, falling back to its link.
            if hasattr(entry, 'guid'):
                g = entry.guid
            elif hasattr(entry, 'link'):
                g = entry.link
            else:
                # Don't know how to deal with this
                continue

            # Have we seen the guid?
            cur.execute("select * from seen_items where guid=%s limit 1",
                        (g, ))
            if len(cur.fetchall()) != 0:
                continue

            # Mark having seen the guid *before* sending (write-ahead), so
            # an entry that breaks sending is not retried on every run.
            cur.execute("insert into seen_items values(%s)", (g,))
            con.commit()

            debug("Zephyring...")
            # Zephyr
            title, link, desc = "", "", ""
            if hasattr(entry, 'title'):
                title = entry.title
            if hasattr(entry, 'link'):
                link = entry.link.rstrip()
            if hasattr(entry, 'description'):
                desc = entry.description
            # NOTE(review): this text is rendered as HTML by w3m, so the
            # blank lines between the fields may not survive as line
            # breaks -- confirm the intended separators.
            output = '%s\n\n%s\n\n%s ' % (title, link, desc)
            sendZephyr(output, target[0], target[1],
                       'http://www.mit.edu/~wdaher/rssZephyr.html')
            debug("Done zephyring.")
finally:
    # Close the connection on every exit path, including emergency stop.
    con.close()