tidy_html plugin for rawdog

Requires the python-tidy package on Fedora. Cleans up the HTML, preventing broken elements from spilling over into adjacent postings. The code was lifted from feedparser.py and dropped into a rawdog plugin, since I couldn't find an easy way to get mx.Tidy installed.

# rawdog plugin to tidy up html output using python-tidy module
# Brian C. Lane <bcl@brianlane.com>
from tidy import parseString
import rawdoglib.plugins, re

def tidy_html(config, box, baseurl, inline):
    """Run the HTML in ``box.value`` through tidy and keep only the body.

    The markup is parsed with ``tidy.parseString`` (XHTML output, numeric
    entities, no line wrapping).  Tidy returns a complete document, so
    everything up to and including the opening ``<body ...>`` tag and
    everything from ``</body`` onwards is discarded, leaving a fragment.

    Arguments:
      config  -- unused by this hook.
      box     -- object whose ``value`` attribute holds the HTML; it is
                 replaced in place with the tidied, stripped fragment.
      baseurl -- unused by this hook.
      inline  -- unused by this hook.

    Returns None; the result is delivered through ``box.value``.
    """
    data = box.value
    was_unicode = isinstance(data, unicode)
    options = dict(output_xhtml=1, numeric_entities=1, wrap=0)
    if was_unicode:
        data = data.encode('utf-8')
        # We just produced UTF-8 bytes, so tell tidy that.  Without this,
        # tidy falls back to its default character encoding and turns every
        # non-ASCII character into mojibake entities.  Byte-string input is
        # of unknown encoding, so it keeps tidy's default handling.
        options['char_encoding'] = 'utf8'

    data = str(parseString(data, **options))

    if was_unicode:
        data = data.decode('utf-8')

    # Drop everything up to and including the opening <body ...> tag.
    _, body_found, rest = data.partition('<body')
    if body_found:
        _, tag_closed, after_tag = rest.partition('>')
        data = after_tag if tag_closed else rest
    # Drop the closing </body> tag and whatever follows it; partition()
    # returns the whole string as element 0 when the tag is absent.
    data = data.partition('</body')[0]

    box.value = data.strip()

# Register tidy_html as a handler for rawdog's "clean_html" hook.
# NOTE(review): presumably rawdog invokes this hook for each article's HTML
# before output -- confirm against the rawdog plugin documentation.
rawdoglib.plugins.attach_hook("clean_html", tidy_html)