Pylighter - Python Syntax Highlighting

Posted by Tom on 2010-05-31 17:11

Back when I was first picking up Python I went looking for some code to syntax highlight it for blog posts. For C# I use Jean-Claude Manoli's C# Formatter, and wanted to reuse the same stylesheets. (I've since been distracted by other things, but I thought this was worth finishing off.)

After some brief Googling I found a likely candidate for plagiarism - the syntax highlighter that comes with MoinMoin. But it uses <font> tags and hard-coded colours, both of which are proven to be carcinogenic to cute little kittens. Clearly this will not do. Someone has to think of the kittens.

Here is the result of some fairly heavy tweaking - Pylighter.

And here it is, as formatted by itself:

# Pylighter - monochromacy.net
# HTML syntax highlighting for Python
# based on the MoinMoin Python Source Parser - moinmo.in
# compatible with the Manoli highlighting styles - www.manoli.net/csharpformat/

import cgi
import keyword
import string
import StringIO
import sys
import token
import tokenize
from xml.sax.saxutils import escape

# Pseudo token type used to tag NAME tokens that are Python keywords.  It is
# placed above NT_OFFSET, outside the range of terminal token types.
_KEYWORD = token.NT_OFFSET + 1

# Token type -> CSS class of the Manoli highlighting styles.  Token types
# missing from this table are written as plain (escaped) text.
_classes = {
    token.NUMBER:       'str',
    token.OP:           'op',
    token.STRING:       'str',
    tokenize.COMMENT:   'rem',
    token.ERRORTOKEN:   'kwrd',
    _KEYWORD:           'kwrd',
}


class Parser:
    """Write Python source as a line-numbered, syntax-highlighted HTML listing.

    ``format`` tokenizes the stored source and passes every token to the
    instance itself (``__call__``), which writes the markup to ``out``.
    Each physical source line becomes one ``<pre>`` element that starts
    with a ``<span class="lnum">`` line number.
    """

    def __init__(self, raw, includePreamble, out=None):
        """Store the source text and the output settings.

        raw             -- Python source text; tabs are expanded and
                           leading/trailing whitespace is stripped.
        includePreamble -- when true, ``format`` wraps the listing in a
                           complete HTML page that links the stylesheets.
        out             -- object with a ``write`` method; defaults to the
                           ``sys.stdout`` in effect when the instance is
                           created.
        """
        self.raw = raw.expandtabs().strip()
        self.includePreamble = includePreamble
        self.out = sys.stdout if out is None else out

    def format(self, formatter, form):
        """Tokenize the stored source and write the highlighted listing.

        ``formatter`` and ``form`` are not used by this method.  Errors
        raised by the tokenizer are not caught here; output written before
        the error stays in ``out``.
        """
        write = self.out.write

        if self.includePreamble:
            write('<html>\n')
            write('<head>\n')
            write('<link rel="stylesheet" type="text/css" href="http://monochromacy.net/Skins/Cbv2/Lib/Css/Style.css" />\n')
            write('<link rel="stylesheet" type="text/css" href="http://monochromacy.net/Skins/Cbv2/Lib/Css/Code.css" />\n')
            write('</head>\n')
            write('<body>\n')

        self.lineNum = 1              # number shown on the next opened line
        self.newlineRequired = True   # True while no <pre> line is open
        self.colPos = 0               # source column already written
        # Physical source lines; text between tokens is copied from here.
        self._lines = self.raw.split('\n')
        self._row = 1                 # source row of the line being written

        write('<div class="code">\n')
        for tok in tokenize.generate_tokens(self._make_readline()):
            self(*tok)
        # Close the last line only if it is still open, so no stray </pre>
        # is written when the final token was a newline.
        if not self.newlineRequired:
            write('</pre>\n')
        write('</div>\n')

        if self.includePreamble:
            write('</body>\n')
            write('</html>\n')

    def __call__(self, toktype, toktext, start, end, line):
        """Token handler: write one token as HTML.

        The arguments are the fields of the five-tuples produced by
        ``tokenize.generate_tokens``:

        toktype -- token type constant
        toktext -- token text
        start   -- (row, column) where the token starts
        end     -- (row, column) where the token ends
        line    -- line text supplied by the tokenizer; not used, text
                   between tokens is read from the stored source instead
        """
        srow, scol = start
        erow, ecol = end

        # Structural tokens carry no text of their own; indentation is
        # copied from the source when the next real token is written.
        if toktype in (token.INDENT, token.DEDENT, token.ENDMARKER):
            return

        # Handle newlines (a blank line still gets its line number)
        if toktype in (token.NEWLINE, tokenize.NL):
            self._seek(srow, 0)
            self._close_line()
            return

        # Rewrite the whitespace the tokenizer stripped out
        self._seek(srow, scol)

        # Do some token type wrangling
        if token.LPAR <= toktype <= token.OP:
            toktype = token.OP
        elif toktype == token.NAME and keyword.iskeyword(toktext):
            toktype = _KEYWORD
        cssClass = _classes.get(toktype)

        # A token may span several physical lines (triple-quoted strings);
        # every line of it gets its own numbered <pre> and its own span.
        for index, segment in enumerate(toktext.split('\n')):
            if index:
                self._close_line()
                self._open_line()
            self._write_token(cssClass, segment)

        # Remember where the token ended so the gap before the next token
        # can be detected, also after a multi-line token.
        self._row = erow
        self.colPos = ecol

    def _make_readline(self):
        """Return a readline-style callable over the stored source lines."""
        # Every line but the last was terminated by a newline in the source.
        feed = [text + '\n' for text in self._lines[:-1]]
        feed.append(self._lines[-1])
        pending = iter(feed)
        # An empty string tells the tokenizer that the input is exhausted.
        return lambda: next(pending, '')

    def _open_line(self):
        """Start a numbered <pre> line unless one is already open."""
        if self.newlineRequired:
            self.out.write(
                '<pre><span class="lnum">{0:>4}:   </span>'.format(self.lineNum))
            self.newlineRequired = False

    def _close_line(self):
        """End the current <pre> line and advance the line counter."""
        self.out.write('</pre>\n')
        self.lineNum += 1
        self.colPos = 0
        self.newlineRequired = True

    def _seek(self, row, col):
        """Write the source text between the current position and (row, col)."""
        if self.newlineRequired:
            self._open_line()
            self._row = row
        # After a backslash continuation the next token arrives on a later
        # row without a NEWLINE/NL token in between: finish the current
        # line (including the backslash) ourselves.
        while self._row < row:
            self.out.write(escape(self._lines[self._row - 1][self.colPos:]))
            self._close_line()
            self._open_line()
            self._row += 1
        if col > self.colPos:
            self.out.write(escape(self._lines[row - 1][self.colPos:col]))
            self.colPos = col

    def _write_token(self, cssClass, text):
        """Write escaped token text, wrapped in a span if it has a class."""
        text = escape(text)
        if cssClass is not None:
            text = '<span class="%s">%s</span>' % (cssClass, text)
        self.out.write(text)

# Command-line use: highlight the file named by the first argument, write the
# page next to it as "<file>.html" and open the result in a viewer.
if __name__ == "__main__":
    import os
    import webbrowser

    if len(sys.argv) < 2:
        sys.exit("usage: %s SOURCE_FILE" % sys.argv[0])

    infile = sys.argv[1]
    outfile = infile + '.html'

    with open(infile) as src:
        source = src.read()

    # Close (and so flush) the page before a viewer is asked to read it.
    with open(outfile, 'wt') as page:
        Parser(source, True, page).format(None, None)

    # No shell command is built from the file name, so names containing
    # spaces or shell metacharacters are handled safely.
    if os.name == "nt":
        os.startfile(outfile)
    else:
        webbrowser.open('file://' + os.path.abspath(outfile))

So there you go. If you've already got a stylesheet set up for the Manoli C# formatter and want to reuse it for Python: enjoy.