#!/usr/bin/env python

# TODO
# distinguish between CGI and command-line invocation
# allow parsing by user name or from a file or url (or string?)
# correct invalid HTML (e.g. mismatched tags)
# make available for download
# package nicely, with an install script
# complete unit tests
# document

"""Produce RSS feeds of Kinja digests"""

__author__ = "David A. Mellis (dam@mellis.org)"
__version__ = "0.3"
__date__ = "$Date: 2005/03/06 16:56:12 $"

import cgitb; cgitb.enable()
from sgmllib import SGMLParser
from BaseHTMLProcessor import BaseHTMLProcessor
import urllib
import PyRSS2Gen
import datetime
import sys
import cgi

class KinjaHTMLParser(BaseHTMLProcessor):
	"""Collect the posts found in the HTML of a Kinja digest page.

	Every <div class="Post"> produces one dict, appended to self.posts
	when that div is closed, with these string entries:

	  'title'   -- text appended while inside an <a> within the post's <h3>
	  'summary' -- text appended after a <p> was seen in the post, except
	               inside <span class="ap"> and <h4> elements
	  'source'  -- first block of text following a <span class="ap">
	  'url'     -- href of the last <a> seen in the post's <h3> or <h4>

	The handlers forward each tag to unknown_starttag/unknown_endtag and
	rely on on_append being invoked with the pieces of output text.
	NOTE(review): presumably BaseHTMLProcessor re-emits the markup and
	calls on_append for each piece -- confirm against that module.  The
	position of the forwarding call relative to the flag updates in each
	handler decides whether the tag's own text is captured, so keep the
	statement order when editing.
	"""

	def reset(self):
		"""Clear the collected posts and all parser state flags."""
		self.posts = []
		self.in_post = False
		self.div_count = 0
		# Give every flag a defined value up front so that no handler
		# depends on a Post div having been seen first.
		self.in_post_title = False
		self.in_post_summary = False
		self.in_post_header = False
		self.in_post_footer = False
		self.in_source = False
		self.omit_from_summary = False
		BaseHTMLProcessor.reset(self)

	def start_div(self, attrs):
		"""Begin a new post on <div class="Post">; track div nesting."""
		self.unknown_starttag('div', attrs)
		if ('class', 'Post') in attrs:
			self.in_post = True
			self.in_post_title = False
			self.in_post_summary = False
			self.in_post_header = False
			self.in_post_footer = False
			self.in_source = False
			# restart the nesting count so the matching </div> is
			# the one that brings it back to zero
			self.div_count = 0
			self.post = {}
			self.post['summary'] = ''
			self.post['title'] = ''
			self.post['source'] = ''
			self.post['url'] = ''
		self.div_count += 1

	def end_div(self):
		"""Store the current post once its opening div is closed."""
		self.div_count -= 1
		if self.in_post and self.div_count == 0:
			self.posts.append(self.post)
			self.in_post = False
			self.in_post_summary = False
		self.unknown_endtag('div')

	def start_h3(self, attrs):
		"""Mark the start of a post's header."""
		if self.in_post:
			self.in_post_header = True
		self.unknown_starttag('h3', attrs)

	def end_h3(self):
		"""Mark the end of a post's header."""
		self.in_post_header = False
		self.unknown_endtag('h3')

	def start_a(self, attrs):
		"""Record the link target of header and footer anchors."""
		# process the tag first, as it's not part of the title
		self.unknown_starttag('a', attrs)
		if self.in_post and self.in_post_header:
			self.in_post_title = True
			self.post['url'] = dict(attrs).get('href')
		if self.in_post and self.in_post_footer:
			self.post['url'] = dict(attrs).get('href')

	def end_a(self):
		"""Stop collecting title text."""
		# process the tag last, as it's not part of the title
		self.in_post_title = False
		self.unknown_endtag('a')

	def start_p(self, attrs):
		"""Start collecting summary text for the current post."""
		if self.in_post:
			self.in_post_summary = True
			self.omit_from_summary = False
		self.unknown_starttag('p', attrs)

	def start_span(self, attrs):
		"""Suspend summary collection inside <span class="ap">."""
		# exclude the digest information from the summary
		if self.in_post and self.in_post_summary and ('class', 'ap') in attrs:
			self.omit_from_summary = True
			self.in_source = True
		self.unknown_starttag('span', attrs)

	def end_span(self):
		"""Resume summary collection after any span."""
		self.unknown_endtag('span')
		self.omit_from_summary = False

	def start_h4(self, attrs):
		"""Mark the start of a post's footer."""
		# exclude the footer from the summary
		if self.in_post:
			self.in_post_footer = True
			if self.in_post_summary:
				self.omit_from_summary = True
		self.unknown_starttag('h4', attrs)

	def end_h4(self):
		"""Mark the end of a post's footer and resume the summary."""
		self.unknown_endtag('h4')
		self.omit_from_summary = False
		self.in_post_footer = False

	def handle_data(self, text):
		"""Capture the post's source name, then pass the text on."""
		if self.in_post and self.in_source:
			self.post['source'] = text.strip()
			# there's lots of junk in with the source in the
			# span, we just want the first block of text
			self.in_source = False
		BaseHTMLProcessor.handle_data(self, text)

	def on_append(self, list, item):
		"""Add item to the title and/or summary of the current post.

		The list argument is not used here; its name is kept because it
		is part of the hook's signature.
		"""
		if self.in_post and self.in_post_title:
			self.post['title'] += item
		if self.in_post and self.in_post_summary and not self.omit_from_summary:
			self.post['summary'] += item

class Page:
	"""One page of a Kinja digest, parsed into its posts.

	Attributes set by the constructor:
	  file  -- the file-like object the page was read from
	  text  -- the complete text read from that object
	  posts -- the post dicts found by KinjaHTMLParser, minus the last
	  next  -- the 'url' entry of that last post, which is removed from
	           posts (Digest appends it to "http://kinja.com" to reach
	           the following page)
	"""
	def __init__(self, file):
		self.file = file
		lines = file.readlines()
		self.text = "".join(lines)

		scraper = KinjaHTMLParser()
		scraper.feed(self.text)
		scraper.close()

		# posts and the parser's list are the same object, so popping
		# the trailing entry also removes it from self.posts
		self.posts = scraper.posts
		trailer = self.posts.pop()
		self.next = trailer['url']

class Digest:
	"""The first pages of a user's Kinja digest.

	Attributes:
	  user  -- the user name the digest was requested for
	  pages -- list of Page objects, in the order they were fetched
	"""
	def __init__(self, user, numpages):
		"""Fetch up to numpages pages of user's digest.

		numpages may be an int or a numeric string; values below one
		fetch nothing.  Fetching stops early when a page offers no
		link to a following page.
		"""
		self.user = user
		self.pages = []
		rooturl = "http://kinja.com/user"
		url = rooturl + "/" + user
		remaining = int(numpages)
		while remaining > 0:
			page = Page(urllib.urlopen(url))
			self.pages.append(page)
			remaining -= 1
			# Only follow the link when another page is wanted, so a
			# missing link on the last requested page cannot fail.
			if remaining == 0 or not page.next:
				break
			url = "http://kinja.com" + page.next

def opendigest(user, numpages):
	"""Build and return the Digest holding numpages pages for user."""
	digest = Digest(user, numpages)
	return digest

def torss(digest):
	"""Return a PyRSS2Gen.RSS2 feed with one item per post in digest.

	digest must provide a user attribute and a pages list whose entries
	each have a posts list of dicts with the keys 'title', 'source',
	'url' and 'summary'.  Every item, and the feed itself, is stamped
	with the time of the call.
	"""
	# PyRSS2Gen formats datetimes as GMT, so take the timestamp in UTC
	# (and only once, so all items of a feed carry the same date).
	now = datetime.datetime.utcnow()
	items = []
	for page in digest.pages:
		for p in page.posts:
			items.append(PyRSS2Gen.RSSItem(
				title = p["title"] + " [" + p["source"] + "]",
				link = p["url"],
				description = p["summary"],
				guid = PyRSS2Gen.Guid(p["url"]),
				pubDate = now))
	return PyRSS2Gen.RSS2(
		title = "Kinja digest for " + digest.user,
		link = "http://kinja.com/user/" + digest.user,
		description = "Generated by kinja2rss " + __version__ + " http://mellis.org/cgi-bin/kinja2rss",
		lastBuildDate = now,
		items = items)

if __name__ == "__main__":
	user = cgi.FieldStorage().getvalue("user") or (len(sys.argv) > 1 and sys.argv[1])
	numpages = cgi.FieldStorage().getvalue("pages") or (len(sys.argv) > 2 and sys.argv[2] or 1)
	
	if user:
		print "Content-type: text/xml"
		print
		print torss(opendigest(user, numpages)).to_xml()
	else:
		print "Content-type: text/html"
		print
		print """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>kinja2rss</title>
</head>
<body>
<h1>kinja2rss</h1>
<h2>David A. Mellis</h2>
<p>
A simple Python script for scraping my Kinja digest and outputting its entries as an RSS feed.  Allows me to read, in order, new weblogs posts, but to do so in a more powerful news reader.
</p>
<form action="" method="get">
User:
<input type="text" name="user" />
Pages:
<select name="pages">
<option value="1">1</option>
<option value="2">2</option>
<option value="3">3</option>
<option value="4">4</option>
<option value="5">5</option>
</select>
<input type="submit" value="Submit" />
</form>
</body>
</html>
"""

# revision history
# 0.3, 2005-03-06: switched to SGMLParser
# 0.2, 2005-02-19: default webpage output when run without parameters
# 0.1: basic functionality (including pages parameter)