#!/usr/bin/env python # TODO # distinguish between CGI and command-line invokation # allow parsing by user name or from a file or url (or string?) # correct invalid HTML (e.g. mismatched tags) # make available for download # package nicely, with an install script # complete unit tests # document """Produce RSS feeds of Kinja digests""" __author__ = "David A. Mellis (dam@mellis.org)" __version__ = "0.3" __date__ = "$Date: 2005/03/06 16:56:12 $" import cgitb; cgitb.enable() from sgmllib import SGMLParser from BaseHTMLProcessor import BaseHTMLProcessor import urllib import PyRSS2Gen import datetime import sys import cgi class KinjaHTMLParser(BaseHTMLProcessor): def reset(self): self.posts = [] self.in_post = False self.div_count = 0 BaseHTMLProcessor.reset(self) def start_div(self, attrs): self.unknown_starttag('div', attrs) if (('class', 'Post') in attrs): self.in_post = True self.in_post_title = False self.in_post_summary = False self.in_post_header = False self.in_post_footer = False self.in_source = False self.div_count = 0 self.post = {}; self.post['summary'] = '' self.post['title'] = '' self.post['source'] = '' self.post['url'] = '' self.div_count += 1 def end_div(self): self.div_count -= 1 if (self.in_post and self.div_count == 0): self.posts.append(self.post) self.in_post = False self.in_post_summary = False self.unknown_endtag('div') def start_h3(self, attrs): if (self.in_post): self.in_post_header = True self.unknown_starttag('h3', attrs) def end_h3(self): self.in_post_header = False self.unknown_endtag('h3') def start_a(self, attrs): # process the tag first, as it's not part of the title self.unknown_starttag('a', attrs) if (self.in_post and self.in_post_header): self.in_post_title = True self.post['url'] = dict(attrs).get('href') if (self.in_post and self.in_post_footer): self.post['url'] = dict(attrs).get('href') def end_a(self): # process the tag last, as it's not part of the title self.in_post_title = False self.unknown_endtag('a') def start_p(self, attrs): if (self.in_post): self.in_post_summary = True self.omit_from_summary = False self.unknown_starttag('p', attrs) def start_span(self, attrs): # exclude the digest information from the summary if (self.in_post and self.in_post_summary and ('class', 'ap') in attrs): self.omit_from_summary = True self.in_source = True self.unknown_starttag('span', attrs) def end_span(self): self.unknown_endtag('span') self.omit_from_summary = False def start_h4(self, attrs): # exclude the footer from the summary if (self.in_post): self.in_post_footer = True if (self.in_post_summary): self.omit_from_summary = True self.unknown_starttag('h4', attrs) def end_h4(self): self.unknown_endtag('h4') self.omit_from_summary = False self.in_post_footer = False def handle_data(self, text): if (self.in_post and self.in_source): self.post['source'] = text.strip() # there's lots of junk in with the source in the # span, we just want the first block of text self.in_source = False BaseHTMLProcessor.handle_data(self, text) def on_append(self, list, item): if (self.in_post and self.in_post_title): self.post['title'] += item if (self.in_post and self.in_post_summary and not self.omit_from_summary): self.post['summary'] += item class Page: """Represents a page of a Kinja digest.""" def __init__(self, file): self.file = file self.text = "".join(file.readlines()) parser = KinjaHTMLParser() parser.feed(self.text) parser.close() self.posts = parser.posts self.next = parser.posts.pop()['url'] class Digest: def __init__(self, user, numpages): self.user = user self.pages = [] rooturl = "http://kinja.com/user" url = rooturl + "/" + user; for i in range(int(numpages)): self.pages.append(Page(urllib.urlopen(url))) url = "http://kinja.com" + self.pages[i].next def opendigest(user, numpages): """Return the Kinja digest for user.""" return Digest(user, numpages) def torss(digest): items = [] for page in digest.pages: items += [PyRSS2Gen.RSSItem( title = p["title"] + " [" + p["source"] + "]", link = p["url"], description = p["summary"], guid = PyRSS2Gen.Guid(p["url"]), pubDate = datetime.datetime.now()) for p in page.posts] return PyRSS2Gen.RSS2( title = "Kinja digest for " + digest.user, link = "http://kinja.com/user/" + digest.user, description = "Generated by kinja2rss " + __version__ + " http://mellis.org/cgi-bin/kinja2rss", lastBuildDate = datetime.datetime.now(), items = items); if __name__ == "__main__": user = cgi.FieldStorage().getvalue("user") or (len(sys.argv) > 1 and sys.argv[1]) numpages = cgi.FieldStorage().getvalue("pages") or (len(sys.argv) > 2 and sys.argv[2] or 1) if user: print "Content-type: text/xml" print print torss(opendigest(user, numpages)).to_xml() else: print "Content-type: text/html" print print """ kinja2rss

kinja2rss

David A. Mellis

A simple Python script for scraping my Kinja digest and outputting its entries as an RSS feed. Allows me to read, in order, new weblogs posts, but to do so in a more powerful news reader.

User: Pages:
""" # revision history # 0.3, 2005-03-06: switched to SGMLParser # 0.2, 2005-02-19: default webpage output when run without parameters # 0.1: basic functionality (including pages parameter)