tidy_html plugin for rawdog
Requires the python-tidy package on Fedora. Cleans up the HTML, preventing broken elements from spilling over into adjacent postings. The code was lifted from feedparser.py and dropped into a plugin for rawdog, since I couldn't find an easy way to get mx.Tidy installed.
# rawdog plugin to tidy up HTML output using the python-tidy module
# Brian C. Lane <bcl@brianlane.com>
#
from tidy import parseString
import rawdoglib.plugins
import re


def tidy_html(config, box, baseurl, inline):
    """Run the HTML in ``box.value`` through tidy and keep only the body.

    The markup is parsed by python-tidy's ``parseString`` (XHTML output,
    numeric entities, no line wrapping).  Tidy returns a whole document, so
    everything up to and including the opening ``<body ...>`` tag and
    everything from ``</body`` onwards is dropped.  The stripped result is
    written back to ``box.value``; nothing is returned.

    ``config``, ``baseurl`` and ``inline`` are accepted but not used here
    (presumably required by the "clean_html" hook signature -- confirm
    against rawdog's plugin documentation).
    """
    data = box.value
    was_unicode = isinstance(data, unicode)

    options = dict(output_xhtml=1, numeric_entities=1, wrap=0)
    if was_unicode:
        # Tidy works on bytes.  Declare the encoding we just applied so the
        # multi-byte UTF-8 sequences are not read as individual Latin-1
        # characters (the default input handling in older tidy releases).
        data = data.encode('utf-8')
        options['char_encoding'] = 'utf8'

    data = str(parseString(data, **options))

    if was_unicode:
        data = unicode(data, 'utf-8')

    # Cut away the document wrapper tidy adds around the fragment.
    if '<body' in data:
        data = data.split('<body', 1)[1]
        # Skip the remainder of the opening tag (attributes and '>').
        if '>' in data:
            data = data.split('>', 1)[1]
    if '</body' in data:
        data = data.split('</body', 1)[0]

    box.value = data.strip()


rawdoglib.plugins.attach_hook("clean_html", tidy_html)