All my uses of the HTMLParser class in the standard library have involved
modifying HTML in some way and writing it back out. It would be very
convenient if the standard library had an HTMLPrinter class, defined as
follows:
-------------------------------------------------
import sys
from xml.sax import saxutils
from HTMLParser import HTMLParser
class HTMLPrinter(HTMLParser):
    """HTMLParser subclass that writes every parsed event back out as markup.

    Each ``handle_*`` callback re-serializes the piece of the document it
    receives and writes it to ``self.outfile``. Subclasses override only the
    handlers they want to change and inherit the printing of everything else.
    """

    def __init__(self, outfile=None):
        """Create a printer.

        outfile -- object with a ``write(str)`` method that receives the
                   output; ``sys.stdout`` is used when it is None.
        """
        HTMLParser.__init__(self)
        if outfile is None:
            self.outfile = sys.stdout
        else:
            self.outfile = outfile

    def handle_data(self, data):
        """Write text content, escaping ``&``, ``<`` and ``>``."""
        self.outfile.write(saxutils.escape(data))

    def handle_starttag(self, tag, attrs):
        """Write a start tag followed by its attributes.

        attrs -- sequence of ``(name, value)`` pairs; a value of None marks
                 an attribute that was given without a value.
        """
        self.outfile.write('<%s' % tag)
        for (name, value) in attrs:
            if value is None:
                # Valueless attribute (e.g. <input disabled>): quoteattr()
                # cannot process None, so emit the bare attribute name.
                self.outfile.write(' %s' % name)
            else:
                self.outfile.write(
                    ' %s=%s' % (name, saxutils.quoteattr(value)))
        self.outfile.write('>')

    def handle_endtag(self, tag):
        """Write the end tag ``</tag>``."""
        self.outfile.write('</%s>' % tag)

    def handle_charref(self, name):
        """Write a numeric character reference ``&#name;`` unchanged."""
        self.outfile.write('&#%s;' % name)

    def handle_entityref(self, name):
        """Write a named entity reference ``&name;`` unchanged."""
        self.outfile.write('&%s;' % name)

    # Open question from the original author: is any quoting needed on
    # comment/decl/pi?  For now their content is written back verbatim.
    def handle_comment(self, data):
        """Write a comment as ``<!--data-->`` without any escaping."""
        self.outfile.write('<!--%s-->' % data)

    def handle_decl(self, decl):
        """Write a declaration as ``<!decl>`` without any escaping."""
        self.outfile.write('<!%s>' % decl)

    def handle_pi(self, data):
        """Write a processing instruction as ``<?data>`` without escaping."""
        self.outfile.write('<?%s>' % data)
-------------------------------------------------
Such a class would make HTML munging much easier.
For instance:
class RemoveBreaks(HTMLPrinter):
    """HTMLPrinter that replaces every ``br`` tag with a single space.

    All other markup is printed unchanged by the inherited handlers.
    """

    def handle_starttag(self, tag, attrs):
        """Print the start tag, or a space in place of a ``br`` tag."""
        if tag != 'br':
            HTMLPrinter.handle_starttag(self, tag, attrs)
        else:
            HTMLPrinter.handle_data(self, ' ')

    def handle_endtag(self, tag):
        """Print the end tag unless it closes a ``br`` element."""
        if tag != 'br':
            HTMLPrinter.handle_endtag(self, tag)
The code becomes much clearer since it focuses on the
munging rather than on all the boilerplate HTML printing.
All my uses of the HTMLParser class in the standard library have involved
modifying HTML in some way and writing it back out. It would be very
convenient if the standard library had an HTMLPrinter class, defined as
follows:
-------------------------------------------------
import sys
from xml.sax import saxutils
from HTMLParser import HTMLParser
class HTMLPrinter(HTMLParser):
    """HTMLParser subclass that writes every parsed event back out as markup.

    Each ``handle_*`` callback re-serializes the piece of the document it
    receives and writes it to ``self.outfile``. Subclasses override only the
    handlers they want to change and inherit the printing of everything else.
    """

    def __init__(self, outfile=None):
        """Create a printer.

        outfile -- object with a ``write(str)`` method that receives the
                   output; ``sys.stdout`` is used when it is None.
        """
        HTMLParser.__init__(self)
        if outfile is None:
            self.outfile = sys.stdout
        else:
            self.outfile = outfile

    def handle_data(self, data):
        """Write text content, escaping ``&``, ``<`` and ``>``."""
        self.outfile.write(saxutils.escape(data))

    def handle_starttag(self, tag, attrs):
        """Write a start tag followed by its attributes.

        attrs -- sequence of ``(name, value)`` pairs; a value of None marks
                 an attribute that was given without a value.
        """
        self.outfile.write('<%s' % tag)
        for (name, value) in attrs:
            if value is None:
                # Valueless attribute (e.g. <input disabled>): quoteattr()
                # cannot process None, so emit the bare attribute name.
                self.outfile.write(' %s' % name)
            else:
                self.outfile.write(
                    ' %s=%s' % (name, saxutils.quoteattr(value)))
        self.outfile.write('>')

    def handle_endtag(self, tag):
        """Write the end tag ``</tag>``."""
        self.outfile.write('</%s>' % tag)

    def handle_charref(self, name):
        """Write a numeric character reference ``&#name;`` unchanged."""
        self.outfile.write('&#%s;' % name)

    def handle_entityref(self, name):
        """Write a named entity reference ``&name;`` unchanged."""
        self.outfile.write('&%s;' % name)

    # Open question from the original author: is any quoting needed on
    # comment/decl/pi?  For now their content is written back verbatim.
    def handle_comment(self, data):
        """Write a comment as ``<!--data-->`` without any escaping."""
        self.outfile.write('<!--%s-->' % data)

    def handle_decl(self, decl):
        """Write a declaration as ``<!decl>`` without any escaping."""
        self.outfile.write('<!%s>' % decl)

    def handle_pi(self, data):
        """Write a processing instruction as ``<?data>`` without escaping."""
        self.outfile.write('<?%s>' % data)
-------------------------------------------------
Such a class would make HTML munging much easier.
For instance:
class RemoveBreaks(HTMLPrinter):
    """HTMLPrinter that replaces every ``br`` tag with a single space.

    All other markup is printed unchanged by the inherited handlers.
    """

    def handle_starttag(self, tag, attrs):
        """Print the start tag, or a space in place of a ``br`` tag."""
        if tag != 'br':
            HTMLPrinter.handle_starttag(self, tag, attrs)
        else:
            HTMLPrinter.handle_data(self, ' ')

    def handle_endtag(self, tag):
        """Print the end tag unless it closes a ``br`` element."""
        if tag != 'br':
            HTMLPrinter.handle_endtag(self, tag)
The code becomes much clearer since it focuses on the
munging rather than on all the boilerplate HTML printing.