#!/usr/bin/env python
# Origin: lexer-fsm.py as added by the patch "Lexer fsm style"
# (commit f72c297c95ba0bfd2333d37322602c8afa726537,
#  Valentin Gehrke, Fri, 16 Dec 2016 17:33:54 +0100).
"""Small table-driven (finite-state-machine) lexer.

Recognizes identifiers (a letter followed by letters/digits) and
non-negative integers, separated by optional whitespace.
"""
import string

TOKEN_ID = "IDENT"
TOKEN_NUM = "NUMBER"

# FSM states; the numeric values index into the two tables below.
_START = 0
_IN_IDENT = 1
_IN_SPACE = 2
_IN_NUMBER = 3

# Per state: maps a set of accepted characters to the state entered after
# consuming one of them.
_TRANSITIONS = [
    {string.ascii_letters: _IN_IDENT,
     string.whitespace: _IN_SPACE,
     string.digits: _IN_NUMBER},
    {string.ascii_letters + string.digits: _IN_IDENT},
    {string.whitespace: _IN_SPACE},
    {string.digits: _IN_NUMBER},
]

# Per state: turns the accumulated text into a token, or into None when the
# text should be dropped.  The start state has no finisher because it never
# holds any accumulated text.
_FINISHERS = [
    None,
    lambda text: (TOKEN_ID, text),
    lambda text: None,  # whitespace produces no token
    lambda text: (TOKEN_NUM, int(text)),
]


def _next_state(state, c):
    """Return the state reached from *state* on character *c*, or None."""
    for charset, target in _TRANSITIONS[state].items():
        if c in charset:
            return target
    return None


def _finish_token(state, chars):
    """Build the token for the text collected in *state* (None if skipped)."""
    return _FINISHERS[state]("".join(chars))


def lex(s):
    """Lazily split *s* into tokens.

    Yields ``(TOKEN_ID, text)`` for identifiers and ``(TOKEN_NUM, value)``
    (with ``value`` an ``int``) for digit runs.  Whitespace separates tokens
    and is not reported.

    Raises:
        ValueError: when a character cannot start any token.  The message
            names the offending character.  Tokens that precede it are
            yielded before the error is raised.
    """
    state = _START
    lexeme = []  # characters of the token currently being built
    for c in s:
        target = _next_state(state, c)
        if target is None:
            if state != _START:
                # c cannot extend the current token: emit what we have and
                # retry the same character from the start state.
                token = _finish_token(state, lexeme)
                if token is not None:
                    yield token
                state = _START
                lexeme = []
                target = _next_state(_START, c)
            if target is None:
                raise ValueError("Unknown character %s" % c)
        lexeme.append(c)
        state = target

    # Flush the token that was still open when the input ended.
    if state != _START:
        token = _finish_token(state, lexeme)
        if token is not None:
            yield token


def main():
    """Print the tokens of a small sample input."""
    s = "Hallo 2 Welt"
    for token in lex(s):
        print(token)


if __name__ == '__main__':
    main()