##
## switchScanner.py 0.6 -- simple generator for C lexical scanners using "switch" statements
## 
## [BEGIN NOTICE]
## 
## Copyright 2005-2006 Larry Hastings
## 
## This software is provided 'as-is', without any express or implied warranty.
## In no event will the authors be held liable for any damages arising from
## the use of this software.
## 
## Permission is granted to anyone to use this software for any purpose,
## including commercial applications, and to alter it and redistribute
## it freely, subject to the following restrictions:
## 
## 1. The origin of this software must not be misrepresented; you must not
##    claim that you wrote the original software. If you use this software
##    in a product, an acknowledgment in the product documentation would be
##    appreciated but is not required.
## 2. Altered source versions must be plainly marked as such, and must not be
##    misrepresented as being the original software.
## 3. This notice may not be removed or altered from any source distribution.
##
## The switchScanner homepage is here:
##         http://www.midwinter.com/~lch/programming/switchScanner/
## 
## [END NOTICE]
##


r"""Simple generator for C lexical scanners using "switch" statements.

This script generates lexical scanners in C.  To use:
    import switchScanner
    s = switchScanner.switchScanner("myScannerName")
    s.addKeyword("keywordGoesHere")
    s.addKeyword("secondKeyword")
    ...
    switchScanner.write()
This will create "scanner.c" and "scanner.h" in the current
directory.  There will be one entry-point in scanner.h:
    extern token myScannerName(char **s);
The "token" class is an enum; its values are auto-generated
from the keywords.  To use the scanner, call:
    token t;
    while ((t = myScannerName(&s)) > TOKEN_NOERROR)
        {
        // recognize T as necessary
        }
The scanner will return TOKEN_EOF if it reaches the end of
the input string without incident, and TOKEN_ERROR if it
encounters an unknown token.

The scanners generated by this file are *lightning-fast*.  They
make exactly one pass through each character in the scanned string.
And they are explicitly *not* data-driven; they use "switch" statements,
recognizing each letter in sequence in the keywords.  For instance, a
scanner that looked for the strings "cat", "car", "cur", and "bat" might
look like this:
    switch (*s++)
        {
        case 'b':
            if (!strcmp(s, "at"))
                return TOKEN_BAT;
            break;
        case 'c':
            switch (*s++)
                {
                case 'a':
                    switch (*s++)
                        {
                        case 'r':
                            return TOKEN_CAR;
                        case 't':
                            return TOKEN_CAT;
                        }
                    break;
                case 'u':
                    if (*s == 'r')
                        return TOKEN_CUR;
                    break;
                }
            break;
        }

This is greatly simplified over the actual code, which handles
case sensitivity and ensures that the keywords terminate.
(The above example, for instance, would return TOKEN_CUR for
the word "curtsey".)  It also has some additional silly little
optimizations.

Notes:

    * The only real limitation of the scanners generated is that
      non-alpha-numeric keywords may only be one character. (That's
      all I needed for the project I had in mind.) Fixing this would
      be a lot of work, as you'd have to figure out a generalized way
      to tell when a token terminates; say, telling %% from %%= and %%?.
      The current code uses !isalnum(), and obviously that won't fly.

    * The scanner simply throws away whitespace. (Again, simple.)

    * The scanner is always case-insensitive.

    * By convention, the first token number is 100 (tokens 0-99 are
      theoretically "reserved"). If you want to start with a different
      number, set switchScanner.tokenValue to that number before adding
      your first token. The minimum number is 3.

    * A switchScanner makes a dandy "perfect hash" to map strings to
      whatever-s. Simply set switchScanner.tokenValue to 3, then add a
      token to the end of your token list called FINAL_TOKEN (or something
      similar). Then create an array to represent your hash values, with
      TOKEN_FINAL_TOKEN entries. To hash a string, call the scanner twice;
      the first time you should get a valid token, the second time you
      should get an EOF. Use the valid token as an index into the array,
      and you're done!

    * If you don't like the names scanner.c and scanner.h, change
      switchScanner.basename to another string (the default is "scanner").
      Similarly, you can change the enum's name by changing
      switchScanner.enumName, and you can change the prefix automatically
      added to keywords by changing switchScanner.tokenPrefix.

    * switchScanner.py also includes the class cPrinter, which makes printing
      C programs convenient by maintaining the indention level for you.

    * Speaking of indention, the generated .c file uses K&R-style indenting.
      (Also called "road-runner" indenting.) Not my cup of tea, but that's
      what they liked at the job I wrote this for. 
"""

import exceptions
import sys
import time
import types

# Prefix put in front of the upper-cased keyword to form its enum identifier;
# also used for the built-in ERROR / EOF / NOERROR / FOURBYTETRICK names.
tokenPrefix = "TOKEN_"
# Stem of the generated files (basename + ".c" / basename + ".h"); also the
# leading part of the generated token-lookup function's name.
basename = "scanner"
# Name of the generated C enum and of the typedef that aliases it.
enumName = "token"

# Timestamp taken once at import time; written into the "generated by"
# banner at the top of both output files.
startTime = time.strftime("%Y/%m/%d %H:%M:%S")


class cPrinter:
    """Line-oriented writer for generated C source that tracks indentation.

    The printer opens `filename` for text writing on construction.  Every
    writeln() call emits one line prefixed with `indentString` repeated once
    per current nesting level.  The printer may be closed explicitly with
    close(), implicitly when it is garbage-collected, or by using it as a
    context manager ("with cPrinter(path) as out: ...").
    """

    # Open file object, or None when no file is open (before __init__
    # succeeds and after close()).
    f = None
    # Current nesting depth, adjusted by indent() / outdent().
    level = 0
    # Text emitted once per nesting level at the start of each line.
    indentString = "\t"

    def __init__(self, filename):
        """Open `filename` for writing in text mode, truncating it."""
        self.f = open(filename, "wt")

    def close(self):
        """Close the underlying file.  Safe to call more than once."""
        if self.f is not None:
            self.f.close()
            self.f = None

    def __del__(self):
        # Falls back on the class-level f = None if __init__ never opened
        # a file, so this is safe even for a half-constructed instance.
        self.close()

    def __enter__(self):
        """Return the printer itself so it can be bound by "with ... as"."""
        return self

    def __exit__(self, excType, excValue, traceback):
        """Close the file on leaving the "with" block; never swallows errors."""
        self.close()
        return False

    def indent(self, increment = 1):
        """Increase the nesting depth by `increment` levels."""
        self.level += increment

    def outdent(self, increment = 1):
        """Decrease the nesting depth by `increment` levels."""
        self.level -= increment

    def writeln(self, s, indent = 0):
        """Write `s` as one line at the current depth plus `indent` extra levels.

        `indent` applies to this line only; it does not change self.level.
        """
        prefix = self.indentString * (self.level + indent)
        self.f.write(prefix + s + "\n")


# Registry of every known token: enum identifier -> numeric value.
tokens = { }
# Next value handed out to a token that is added without an explicit value.
# Callers may change this before adding their first token.
tokenValue = 100

def addToken(token, value = None):
    """Register `token` in the global token table.

    token -- full enum identifier (including any prefix) to register.
    value -- numeric value for the token.  When omitted, the next free
             auto-assigned value (tokenValue) is used and the counter is
             advanced.  An explicit value does not advance the counter.

    On the very first call the three reserved tokens are registered first:
    ERROR = 0, EOF = 1 and NOERROR = max(2, tokenValue - 1), after raising
    tokenValue to at least 3 so it cannot collide with them.

    Adding a token name that is already registered is a no-op; the original
    value is kept.
    """
    global tokens
    global tokenValue
    global tokenPrefix

    if not tokens:
        tokens[tokenPrefix + "ERROR"] = 0
        tokens[tokenPrefix + "EOF"] = 1
        tokenValue = max(tokenValue, 3)
        tokens[tokenPrefix + "NOERROR"] = max(2, tokenValue - 1)

    if token in tokens:
        return

    if value is None:
        value = tokenValue
        tokenValue += 1

    # Store the value chosen above.  (Storing tokenValue here instead would
    # ignore an explicit value and give auto-numbered tokens the
    # already-incremented counter.)
    tokens[token] = value


def writeTokensH(output):
    """Write the token enum, its typedef and the lookup prototype to a header.

    output -- printer object providing writeln(), indent() and outdent().

    Tokens are emitted in ascending numeric order (ties broken by name) so
    that the generated header is identical from run to run.  A final
    FOURBYTETRICK member with value 0x10000000 is always appended.
    NOTE(review): presumably FOURBYTETRICK forces the enum to be at least
    four bytes wide -- confirm.
    """
    global tokens
    global enumName

    output.writeln("enum " + enumName + " {")
    output.indent()
    # items() rather than iteritems(): works on both Python 2 and Python 3.
    for (token, value) in sorted(tokens.items(), key=lambda item: (item[1], item[0])):
        output.writeln(token + " = " + str(value) + ",")
    output.writeln("")
    output.writeln(tokenPrefix + "FOURBYTETRICK = 0x10000000")
    output.outdent()
    output.writeln("};")
    output.writeln("typedef enum " + enumName + " " + enumName + ";")
    output.writeln("extern char *" + basename + enumName.capitalize() + "Lookup(" + enumName + " t);")
    output.writeln("")


def writeTokensC(output):
    """Write the C function that maps a token value back to its name.

    output -- printer object providing writeln(), indent() and outdent().

    The generated function is a switch over every registered token that
    returns the token's identifier as a string, or "(unknown!)" for any
    other value.  Cases are emitted in ascending numeric order (ties broken
    by name) so the output is identical from run to run.
    """
    global tokens
    global enumName

    output.writeln("char *" + basename + enumName.capitalize() + "Lookup(" + enumName + " t) {")
    output.indent()
    output.writeln("switch (t) {")
    output.indent()
    # Two token names may share one numeric value; C rejects duplicate case
    # labels, so only the first name for each value gets a case.
    emittedValues = set()
    # items() rather than iteritems(): works on both Python 2 and Python 3.
    for (token, value) in sorted(tokens.items(), key=lambda item: (item[1], item[0])):
        if value in emittedValues:
            continue
        emittedValues.add(value)
        output.writeln("case " + token + ": return \"" + token + "\";")
    output.writeln("default: return \"(unknown!)\";")
    output.outdent()
    output.writeln("}")
    output.outdent()
    output.writeln("}")
    output.writeln("")


# Every switchScanner instance created so far, keyed by scanner name.
# writeHeader() emits one prototype per key and writeC() emits one function
# per value.
scanners = { }

# NOTE(review): no code visible in this file reads or assigns this value;
# it appears to be unused.
anonymousScanner = 0


class switchScanner:
    """Accumulates keywords and emits one C scanner function for them.

    Keywords are stored in a trie of nested dicts rooted at self.root.  Each
    key is a single lower-cased character, or "" to mark that a keyword ends
    at this point.  Each value is either another dict (several keywords share
    the prefix so far) or a tuple (andTheRest, token): the not-yet-matched
    tail of the only keyword below that point and the enum identifier to
    return for it.

    Creating an instance registers it in the module-level `scanners` dict.
    """

    def __init__(self, name = "scanner"):
        """Create an empty scanner and register it under `name`.

        name -- name of the generated C function.  It must differ from the
                name of every other switchScanner; otherwise the process
                exits via sys.exit().
        """
        global scanners
        self.root = {}
        self.name = None
        # When true, writeRecurse() tags each construct it emits with a
        # "/* type N */" comment identifying the code path that produced it.
        self.printType = 1
        if name in scanners:
            sys.exit("Each switchScanner must have a unique name!  Use switchScanner(\"name\") to name them.")
        self.name = name
        scanners[name] = self

    def addKeyword(self, name, token = None, map = None, tokenize = True):
        """Add keyword `name` to the trie.

        name     -- keyword text; lower-cased before insertion.  Keywords
                    that are not purely alphanumeric may only be one
                    character long (the process exits otherwise).
        token    -- enum identifier to return for the keyword.  Defaults to
                    tokenPrefix + name.upper().
        map      -- trie node to insert into; defaults to the root.  Used by
                    the recursion, not normally passed by callers.
        tokenize -- when true, also register `token` through addToken().
                    The recursion passes False so that a token is registered
                    only once.

        Adding a keyword that is already present leaves the trie unchanged
        and keeps the token of the first registration.
        """
        global tokenPrefix
        if map is None:
            map = self.root

        if (not name.isalnum()) and (len(name) > 1):
            sys.exit("Invalid token: \"" + name + "\", non-alnum tokens may only be one character!")
        name = name.lower()
        if token is None:
            token = tokenPrefix + name.upper()
        if tokenize:
            addToken(token)

        # Slicing yields "" for both parts when name itself is empty, which
        # is how "keyword ends here" is represented.
        firstCharacter = name[:1]
        andTheRest = name[1:]

        if firstCharacter not in map:
            map[firstCharacter] = (andTheRest, token)
            return

        currentValue = map[firstCharacter]
        if isinstance(currentValue, dict):
            self.addKeyword(andTheRest, token, currentValue, False)
            return

        (existingRest, existingToken) = currentValue
        if existingRest == andTheRest:
            # Same keyword added again.  Splitting the node here would
            # re-insert two identical tails forever, so treat it as a no-op.
            return

        # Two different keywords now share this prefix: replace the leaf
        # with a sub-trie and push both tails one level down.
        newMap = {}
        map[firstCharacter] = newMap
        self.addKeyword(andTheRest, token, newMap, False)
        self.addKeyword(existingRest, existingToken, newMap, False)

    def _charLiteral(self, character):
        """Return `character` as a C character literal.

        A single quote or a backslash is escaped with a backslash so that
        the generated literal is valid C.
        """
        if character == "'" or character == "\\":
            return "'\\" + character + "'"
        return "'" + character + "'"

    def writeRecurse(self, output, map):
        """Emit the C code that recognizes every keyword below trie node `map`.

        output -- printer object providing writeln(), indent() and outdent().
        map    -- trie node (dict) as built by addKeyword().

        The emitted code relies on a local "char *s" pointing at the next
        unread character and on "char **s_in", which it updates just before
        returning a token.  When self.printType is true each construct is
        preceded by a "/* type N */" comment.
        """
        if "" in map:
            # A keyword ends exactly here; accept it if the input does too.
            if self.printType: output.writeln("/* type 1 */")
            output.writeln("if (!isalnum(*s)) {")
            output.writeln("*s_in = s;", indent=1)
            output.writeln("return " + map[""][1] + ";", indent=1)
            output.writeln("}")
            if len(map) == 1:
                # Nothing else can match below this node.  No "break" is
                # emitted: control simply falls through to whatever the
                # caller writes next (its own "break;" or the final error
                # return), and a break here could land outside any switch.
                return

        ## if there's only one entry in the map,
        ## don't bother with the switch statement.
        if len(map) == 1:
            key = next(iter(map))
            value = map[key]

            ## if it's a sub-dictionary,
            if isinstance(value, dict):
                if self.printType: output.writeln("/* type 2 */")
                output.writeln("if (tolower(*s++) == " + self._charLiteral(key[0]) + ") {")
                output.indent()
                self.writeRecurse(output, value)
                output.outdent()
                output.writeln("}")
                return

            ## it's a single token
            (andTheRest, token) = value
            text = key + andTheRest
            if len(text) == 1:
                if not text.isalnum():
                    ## a single punctuation character: compared verbatim
                    if self.printType: output.writeln("/* type 3 */")
                    output.writeln("if (*s++ == " + self._charLiteral(text) + ") {")
                else:
                    ## a single alphanumeric character: compared case-insensitively
                    if self.printType: output.writeln("/* type 4 */")
                    output.writeln("if (tolower(*s++) == " + self._charLiteral(text) + ") {")
                output.writeln("*s_in = s;", indent=1)
                output.writeln("return " + token + ";", indent=1)
                output.writeln("}")
            else:
                ## it's got multiple characters left
                if self.printType: output.writeln("/* type 5 */")
                length = str(len(text))
                output.writeln("if (!strncasecmp(s, \"" + text + "\", " + length + ") && !isalnum(s[" + length + "])) {")
                output.writeln("*s_in = s + " + length + ";", indent=1)
                output.writeln("return " + token + ";", indent=1)
                output.writeln("}")
            return

        output.writeln("switch (tolower(*s++)) {")
        output.indent()
        # Sorted so the generated code is identical from run to run.
        for key in sorted(map):
            value = map[key]
            if len(key) == 0:
                # The end-of-keyword entry was handled by "type 1" above.
                continue
            # special handling for punctuation
            if not key[0].isalnum():
                if self.printType: output.writeln("/* type 6 */")
                output.writeln("case " + self._charLiteral(key[0]) + ":")
                output.writeln("*s_in = s;", indent=1)
                output.writeln("return " + value[1] + ";", indent=1)
                continue
            output.writeln("case '" + key.lower() + "':")
            if isinstance(value, dict):
                if self.printType: output.writeln("/* type 7 */")
                output.indent()
                self.writeRecurse(output, value)
                output.outdent()
            else:
                (andTheRest, token) = value
                length = len(andTheRest)
                output.indent()
                if length == 0:
                    if self.printType: output.writeln("/* type 8 */")
                    output.writeln("if (!isalnum(*s)) {")
                elif length == 1:
                    if self.printType: output.writeln("/* type 9 */")
                    output.writeln("if ((tolower(*s) == '" + andTheRest + "') && !isalnum(s[1])) {")
                    output.writeln("s += 1;", indent=1)
                else:
                    if self.printType: output.writeln("/* type 10 */")
                    slength = str(length)
                    output.writeln("if (!strncasecmp(\"" + andTheRest + "\", s, " + slength + ") && !isalnum(s[" + slength + "])) {")
                    output.writeln("s += " + slength + ";", indent=1)
                output.writeln("*s_in = s;", indent=1)
                output.writeln("return " + token + ";", indent=1)
                output.writeln("}")
                output.outdent()
            output.writeln("break;")
        output.outdent()
        output.writeln("}")

    def write(self, output):
        """Emit the complete C function for this scanner.

        output -- printer object providing writeln(), indent() and outdent().

        The generated function skips leading whitespace, returns the EOF
        token at the end of the string, tries every keyword via
        writeRecurse(), and returns the ERROR token when nothing matched.
        """
        global enumName
        global tokenPrefix

        output.writeln(enumName + " " + self.name + "(char **s_in) {")
        output.indent()
        output.writeln("char *s = *s_in;")
        output.writeln("while (isspace(*s)) {")
        output.writeln("s++;", indent=1)
        output.writeln("}")
        output.writeln("if (!*s) {")
        output.writeln("*s_in = s;", indent=1)
        output.writeln("return " + tokenPrefix + "EOF;", indent=1)
        output.writeln("}")
        self.writeRecurse(output, self.root)
        output.writeln("return " + tokenPrefix + "ERROR;")
        output.outdent()
        output.writeln("}")
        output.writeln("")


def writeHeader():
    """Generate the C header file, basename + ".h".

    The header holds a "generated by" banner, an include guard, an
    extern "C" wrapper for C++ callers, the token enum (via writeTokensH)
    and one prototype for every registered scanner.
    """
    global basename
    global enumName
    global startTime
    guard = "__" + basename.upper() + "_H"
    header = cPrinter(basename + ".h")

    preamble = (
        "/* generated by switchScanner.py 0.6 :: " + startTime + " */",
        "#ifndef " + guard,
        "#define  " + guard,
        "",
        "#ifdef __cplusplus",
        "extern \"C\" {",
        "#endif /* __cplusplus */",
        "",
    )
    for line in preamble:
        header.writeln(line)

    writeTokensH(header)
    header.writeln("")
    # One prototype per registered scanner.
    for scannerName in scanners.keys():
        header.writeln("extern " + enumName + " " + scannerName + "(char **s);")

    postamble = (
        "",
        "#ifdef __cplusplus",
        "};",
        "#endif /* __cplusplus */",
        "",
        "#endif /*  " + guard + " */",
        "",
    )
    for line in postamble:
        header.writeln(line)
    header.close()

def writeC():
    """Generate the C source file, basename + ".c".

    The file holds a "generated by" banner, the required includes, a
    Windows shim that maps strncasecmp onto strnicmp, the token lookup
    function (via writeTokensC) and the function for every registered
    scanner.
    """
    global basename
    source = cPrinter(basename + ".c")
    emit = source.writeln

    emit("/* generated by switchScanner.py 0.6 :: " + startTime + " */")
    for header in ("<ctype.h>", "<string.h>"):
        emit("#include " + header)
    emit("")
    emit("#include \"" + basename + ".h\"")
    emit("")

    # Windows has no strncasecmp; the two directives are written one level
    # deeper than the surrounding #ifdef / #endif.
    emit("#ifdef _WIN32")
    windowsDirectives = (
        "#define strncasecmp strnicmp",
        "#pragma warning( disable : 4996 ) /* shut up about strnicmp, you loser, VC8 */",
    )
    for directive in windowsDirectives:
        emit(directive, indent=1)
    emit("#endif /* _WIN32 */")
    emit("")

    writeTokensC(source)
    emit("")
    for scanner in scanners.values():
        scanner.write(source)
    source.close()


def write():
    """Generate both output files: the header first, then the C source."""
    writeHeader()
    writeC()

