path: root/tools/lex.py
author     Ronnie Sahlberg <ronnie_sahlberg@ozemail.com.au>   2004-05-24 08:33:09 +0000
committer  Ronnie Sahlberg <ronnie_sahlberg@ozemail.com.au>   2004-05-24 08:33:09 +0000
commit     67e156bb57bdffc6b4e28155e1a464d365315735 (patch)
tree       d8cb2df39b97cb21ffb26a300305e8a2ecd2de17 /tools/lex.py
parent     911bad80f01e1a526cb5db6a1c8b67224a0eb6fd (diff)
download   wireshark-67e156bb57bdffc6b4e28155e1a464d365315735.tar.gz
From Tomas Kukosa:
ASN2ETH compiler and support script for lexical and syntactic analysis. Will later be used for all those ASN.1 protocols we haven't implemented yet.

svn path=/trunk/; revision=10983
Diffstat (limited to 'tools/lex.py')
-rw-r--r--   tools/lex.py   681
1 files changed, 681 insertions, 0 deletions
diff --git a/tools/lex.py b/tools/lex.py
new file mode 100644
index 0000000000..7033e8355a
--- /dev/null
+++ b/tools/lex.py
@@ -0,0 +1,681 @@
+#-----------------------------------------------------------------------------
+# ply: lex.py
+#
+# Author: David M. Beazley (beazley@cs.uchicago.edu)
+# Department of Computer Science
+# University of Chicago
+# Chicago, IL 60637
+#
+# Copyright (C) 2001, David M. Beazley
+#
+# $Header: /svn/cvsroot/ethereal/tools/lex.py,v 1.1 2004/05/24 08:33:09 sahlberg Exp $
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+# See the file COPYING for a complete copy of the LGPL.
+#
+#
+# This module automatically constructs a lexical analysis module from regular
+# expression rules defined in a user-defined module. The idea is essentially the same
+# as that used in John Aycock's Spark framework, but the implementation works
+# at the module level rather than requiring the use of classes.
+#
+# This module tries to provide an interface that is closely modeled after
+# the traditional lex interface in Unix. It also differs from Spark
+# in that:
+#
+# - It provides more extensive error checking and reporting if
+# the user supplies a set of regular expressions that can't
+# be compiled or if there is any other kind of a problem in
+# the specification.
+#
+# - The interface is geared towards LALR(1) and LR(1) parser
+# generators. That is, tokens are generated one at a time
+# rather than being generated in advance all in one step.
+#
+# There are a few limitations of this module
+#
+# - The module interface makes it somewhat awkward to support more
+# than one lexer at a time. Although somewhat inelegant from a
+# design perspective, this is rarely a practical concern for
+# most compiler projects.
+#
+# - The lexer requires that the entire input text be read into
+# a string before scanning. I suppose that most machines have
+# enough memory to make this a minor issue, but it makes
+# the lexer somewhat difficult to use in interactive sessions
+# or with streaming data.
+#
+#-----------------------------------------------------------------------------
+
+r"""
+lex.py
+
+This module builds lex-like scanners based on regular expression rules.
+To use the module, simply write a collection of regular expression rules
+and actions like this:
+
+# lexer.py
+import lex
+
+# Define a list of valid tokens
+tokens = (
+    'IDENTIFIER', 'NUMBER', 'PLUS', 'MINUS'
+    )
+
+# Define tokens as functions
+def t_IDENTIFIER(t):
+    r' [a-zA-Z_](\w|_)* '
+    return t
+
+def t_NUMBER(t):
+    r' \d+ '
+    return t
+
+# Some simple tokens with no actions
+t_PLUS = r'\+'
+t_MINUS = r'-'
+
+# Initialize the lexer
+lex.lex()
+
+The tokens list is required and contains a complete list of all valid
+token types that the lexer is allowed to produce. Token types are
+restricted to be valid identifiers. This means that 'MINUS' is a valid
+token type whereas '-' is not.
+
+Rules are defined by writing a function with a name of the form
+t_rulename. Each rule must accept a single argument which is
+a token object generated by the lexer. This token has the following
+attributes:
+
+    t.type   = type string of the token. This is initially set to the
+               name of the rule without the leading t_
+    t.value  = The value of the lexeme.
+    t.lineno = The value of the line number where the token was encountered
+
+For example, the t_NUMBER() rule above might be called with the following:
+
+    t.type = 'NUMBER'
+    t.value = '42'
+    t.lineno = 3
+
+Each rule returns the token object it would like to supply to the
+parser. In most cases, the token t is returned with few, if any
+modifications. To discard a token for things like whitespace or
+comments, simply return nothing. For instance:
+
+def t_whitespace(t):
+    r' \s+ '
+    pass
+
+For faster lexing, you can also define this in terms of the ignore set like this:
+
+t_ignore = ' \t'
+
+The characters in this string are ignored by the lexer. Use of this feature can speed
+up parsing significantly since scanning will immediately proceed to the next token.
+
+lex requires that the token returned by each rule has an attribute
+t.type. Other than this, rules are free to return any kind of token
+object that they wish and may construct a new type of token object
+from the attributes of t (provided the new object has the required
+type attribute).
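+
+For instance, a rule could package the lexeme in a small class of its
+own (MyToken below is purely illustrative and is not part of this
+module), as long as the object it returns still carries the required
+attributes:
+
+class MyToken:
+    def __init__(self, type, value, lineno):
+        self.type = type            # required by lex
+        self.value = value
+        self.lineno = lineno
+
+def t_NUMBER(t):
+    r' \d+ '
+    # Return a custom object instead of t; it still has .type, .value, .lineno
+    return MyToken(t.type, int(t.value), t.lineno)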
+
+If illegal characters are encountered, the scanner executes the
+function t_error(t) where t is a token representing the rest of the
+string that hasn't been matched. If this function isn't defined, a
+LexError exception is raised. The .text attribute of this exception
+object contains the part of the string that wasn't matched.
+
+The t.skip(n) method can be used to skip ahead n characters in the
+input stream. This is usually only used in the error handling rule.
+For instance, the following rule would print an error message and
+continue:
+
+def t_error(t):
+ print "Illegal character in input %s" % t.value[0]
+ t.skip(1)
+
+Of course, a nice scanner might wish to skip more than one character
+if the input looks very corrupted.
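+
+As a rough sketch of that idea (the resynchronization strategy below is
+only an example, not something this module requires), an error rule
+could skip ahead to the next whitespace character:
+
+def t_error(t):
+    print "Illegal character in input %s" % t.value[0]
+    # Resynchronize at the next whitespace character, or skip a single
+    # character if none is found (assumes the rule file imports re).
+    m = re.search(r'\s', t.value)
+    if m:
+        t.skip(max(m.start(), 1))
+    else:
+        t.skip(1)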
+
+The lex module defines a t.lineno attribute on each token that can be used
+to track the current line number in the input. The value of this
+variable is not modified by lex so it is up to your lexer module
+to correctly update its value depending on the lexical properties
+of the input language. To do this, you might write rules such as
+the following:
+
+def t_newline(t):
+    r' \n+ '
+    t.lineno += t.value.count("\n")
+
+To initialize your lexer so that it can be used, simply call the lex.lex()
+function in your rule file. If there are any errors in your
+specification, warning messages or an exception will be generated to
+alert you to the problem.
+
+(dave: this needs to be rewritten)
+To use the newly constructed lexer from another module, simply do
+this:
+
+    import lex
+    import lexer
+    lex.input("position = initial + rate*60")
+
+    while 1:
+        token = lex.token()       # Get a token
+        if not token: break       # No more tokens
+        ... do whatever ...
+
+Assuming that the module 'lexer' has initialized lex as shown
+above, parsing modules can safely import 'lex' without having
+to import the rule file or any additional information about the
+scanner you have defined.
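+
+The lex() function also accepts a few optional keyword arguments (see
+the definition of lex() later in this file for what they do). As a
+rough sketch, a rule file that wants lex to cache its compiled tables
+in a generated lextab.py module, rather than rebuilding them on every
+run, might call:
+
+    lex.lex(optimize=1, lextab="lextab")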
+"""
+
+# -----------------------------------------------------------------------------
+
+
+__version__ = "1.3"
+
+import re, types, sys, copy
+
+# Exception thrown when invalid token encountered and no default
+class LexError(Exception):
+    def __init__(self,message,s):
+        self.args = (message,)
+        self.text = s
+
+# Token class
+class LexToken:
+    def __str__(self):
+        return "LexToken(%s,%r,%d)" % (self.type,self.value,self.lineno)
+    def __repr__(self):
+        return str(self)
+    def skip(self,n):
+        try:
+            self._skipn += n
+        except AttributeError:
+            self._skipn = n
+
+# -----------------------------------------------------------------------------
+# Lexer class
+#
+# input() - Store a new string in the lexer
+# token() - Get the next token
+# -----------------------------------------------------------------------------
+
+class Lexer:
+    def __init__(self):
+        self.lexre = None           # Master regular expression
+        self.lexdata = None         # Actual input data (as a string)
+        self.lexpos = 0             # Current position in input text
+        self.lexlen = 0             # Length of the input text
+        self.lexindexfunc = [ ]     # Reverse mapping of groups to functions and types
+        self.lexerrorf = None       # Error rule (if any)
+        self.lextokens = None       # List of valid tokens
+        self.lexignore = None       # Ignored characters
+        self.lineno = 1             # Current line number
+        self.debug = 0              # Debugging mode
+        self.optimize = 0           # Optimized mode
+        self.token = self.errtoken
+
+    def __copy__(self):
+        c = Lexer()
+        c.lexre = self.lexre
+        c.lexdata = self.lexdata
+        c.lexpos = self.lexpos
+        c.lexlen = self.lexlen
+        c.lexindexfunc = self.lexindexfunc
+        c.lexerrorf = self.lexerrorf
+        c.lextokens = self.lextokens
+        c.lexignore = self.lexignore
+        c.lineno = self.lineno
+        c.optimize = self.optimize
+        c.token = c.realtoken
+        return c
+
+    # ------------------------------------------------------------
+    # input() - Push a new string into the lexer
+    # ------------------------------------------------------------
+    def input(self,s):
+        if not isinstance(s,types.StringType):
+            raise ValueError, "Expected a string"
+        self.lexdata = s
+        self.lexpos = 0
+        self.lexlen = len(s)
+        self.token = self.realtoken
+
+        # Change the token routine to point to realtoken()
+        global token
+        if token == self.errtoken:
+            token = self.token
+
+    # ------------------------------------------------------------
+    # errtoken() - Return error if token is called with no data
+    # ------------------------------------------------------------
+    def errtoken(self):
+        raise RuntimeError, "No input string given with input()"
+
+    # ------------------------------------------------------------
+    # token() - Return the next token from the Lexer
+    #
+    # Note: This function has been carefully implemented to be as fast
+    # as possible. Don't make changes unless you really know what
+    # you are doing
+    # ------------------------------------------------------------
+    def realtoken(self):
+        # Make local copies of frequently referenced attributes
+        lexpos = self.lexpos
+        lexlen = self.lexlen
+        lexignore = self.lexignore
+        lexdata = self.lexdata
+
+        while lexpos < lexlen:
+            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
+            if lexdata[lexpos] in lexignore:
+                lexpos += 1
+                continue
+
+            # Look for a regular expression match
+            m = self.lexre.match(lexdata,lexpos)
+            if m:
+                i = m.lastindex
+                lexpos = m.end()
+                tok = LexToken()
+                tok.value = m.group()
+                tok.lineno = self.lineno
+                tok.lexer = self
+                func,tok.type = self.lexindexfunc[i]
+                if not func:
+                    self.lexpos = lexpos
+                    return tok
+
+                # If token is processed by a function, call it
+                self.lexpos = lexpos
+                newtok = func(tok)
+                self.lineno = tok.lineno        # Update line number
+
+                # Every function must return a token, if nothing, we just move to next token
+                if not newtok: continue
+
+                # Verify type of the token. If not in the token map, raise an error
+                if not self.optimize:
+                    if not self.lextokens.has_key(newtok.type):
+                        raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
+                            func.func_code.co_filename, func.func_code.co_firstlineno,
+                            func.__name__, newtok.type),lexdata[lexpos:])
+
+                return newtok
+
+            # No match. Call t_error() if defined.
+            if self.lexerrorf:
+                tok = LexToken()
+                tok.value = self.lexdata[lexpos:]
+                tok.lineno = self.lineno
+                tok.type = "error"
+                tok.lexer = self
+                oldpos = lexpos
+                newtok = self.lexerrorf(tok)
+                lexpos += getattr(tok,"_skipn",0)
+                if oldpos == lexpos:
+                    # Error method didn't change text position at all. This is an error.
+                    self.lexpos = lexpos
+                    raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
+                if not newtok: continue
+                self.lexpos = lexpos
+                return newtok
+
+            self.lexpos = lexpos
+            raise LexError, ("No match found", lexdata[lexpos:])
+
+        # No more input data
+        self.lexpos = lexpos + 1
+        return None
+
+
+# -----------------------------------------------------------------------------
+# validate_file()
+#
+# This checks to see if there are duplicated t_rulename() functions or strings
+# in the parser input file. This is done using a simple regular expression
+# match on each line in the filename.
+# -----------------------------------------------------------------------------
+
+def validate_file(filename):
+    import os.path
+    base,ext = os.path.splitext(filename)
+    if ext != '.py': return 1          # No idea what the file is. Return OK
+
+    try:
+        f = open(filename)
+        lines = f.readlines()
+        f.close()
+    except IOError:
+        return 1                       # Oh well
+
+    fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
+    sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')
+    counthash = { }
+    linen = 1
+    noerror = 1
+    for l in lines:
+        m = fre.match(l)
+        if not m:
+            m = sre.match(l)
+        if m:
+            name = m.group(1)
+            prev = counthash.get(name)
+            if not prev:
+                counthash[name] = linen
+            else:
+                print "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev)
+                noerror = 0
+        linen += 1
+    return noerror
+
+# -----------------------------------------------------------------------------
+# _read_lextab(module)
+#
+# Reads lexer table from a lextab file instead of using introspection.
+# -----------------------------------------------------------------------------
+
+def _read_lextab(lexer, fdict, module):
+    exec "import %s as lextab" % module
+    lexer.lexre = re.compile(lextab._lexre, re.VERBOSE)
+    lexer.lexindexfunc = lextab._lextab
+    for i in range(len(lextab._lextab)):
+        t = lexer.lexindexfunc[i]
+        if t:
+            if t[0]:
+                lexer.lexindexfunc[i] = (fdict[t[0]],t[1])
+    lexer.lextokens = lextab._lextokens
+    lexer.lexignore = lextab._lexignore
+    if lextab._lexerrorf:
+        lexer.lexerrorf = fdict[lextab._lexerrorf]
+
+# -----------------------------------------------------------------------------
+# lex(module)
+#
+# Build all of the regular expression rules from definitions in the supplied module
+# -----------------------------------------------------------------------------
+def lex(module=None,debug=0,optimize=0,lextab="lextab"):
+    ldict = None
+    regex = ""
+    error = 0
+    files = { }
+    lexer = Lexer()
+    lexer.debug = debug
+    lexer.optimize = optimize
+    global token,input
+
+    if module:
+        if not isinstance(module, types.ModuleType):
+            raise ValueError,"Expected a module"
+
+        ldict = module.__dict__
+
+    else:
+        # No module given. We might be able to get information from the caller.
+        try:
+            raise RuntimeError
+        except RuntimeError:
+            e,b,t = sys.exc_info()
+            f = t.tb_frame
+            f = f.f_back                 # Walk out to our calling function
+            ldict = f.f_globals          # Grab its globals dictionary
+
+    if optimize and lextab:
+        try:
+            _read_lextab(lexer,ldict, lextab)
+            if not lexer.lexignore: lexer.lexignore = ""
+            token = lexer.token
+            input = lexer.input
+            return lexer
+
+        except ImportError:
+            pass
+
+    # Get the tokens map
+    tokens = ldict.get("tokens",None)
+    if not tokens:
+        raise SyntaxError,"lex: module does not define 'tokens'"
+    if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)):
+        raise SyntaxError,"lex: tokens must be a list or tuple."
+
+    # Build a dictionary of valid token names
+    lexer.lextokens = { }
+    if not optimize:
+
+        # Utility function for verifying tokens
+        def is_identifier(s):
+            for c in s:
+                if not (c.isalnum() or c == '_'): return 0
+            return 1
+
+        for n in tokens:
+            if not is_identifier(n):
+                print "lex: Bad token name '%s'" % n
+                error = 1
+            if lexer.lextokens.has_key(n):
+                print "lex: Warning. Token '%s' multiply defined." % n
+            lexer.lextokens[n] = None
+    else:
+        for n in tokens: lexer.lextokens[n] = None
+
+
+    if debug:
+        print "lex: tokens = '%s'" % lexer.lextokens.keys()
+
+    # Get a list of symbols with the t_ prefix
+    tsymbols = [f for f in ldict.keys() if f[:2] == 't_']
+
+    # Now build up a list of functions and a list of strings
+    fsymbols = [ ]
+    ssymbols = [ ]
+    for f in tsymbols:
+        if isinstance(ldict[f],types.FunctionType):
+            fsymbols.append(ldict[f])
+        elif isinstance(ldict[f],types.StringType):
+            ssymbols.append((f,ldict[f]))
+        else:
+            print "lex: %s not defined as a function or string" % f
+            error = 1
+
+    # Sort the functions by line number
+    fsymbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno))
+
+    # Sort the strings by regular expression length
+    ssymbols.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
+
+    # Check for non-empty symbols
+    if len(fsymbols) == 0 and len(ssymbols) == 0:
+        raise SyntaxError,"lex: no rules of the form t_rulename are defined."
+
+    # Add all of the rules defined with actions first
+    for f in fsymbols:
+
+        line = f.func_code.co_firstlineno
+        file = f.func_code.co_filename
+        files[file] = None
+
+        if not optimize:
+            if f.func_code.co_argcount > 1:
+                print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__)
+                error = 1
+                continue
+
+            if f.func_code.co_argcount < 1:
+                print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__)
+                error = 1
+                continue
+
+            if f.__name__ == 't_ignore':
+                print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__)
+                error = 1
+                continue
+
+        if f.__name__ == 't_error':
+            lexer.lexerrorf = f
+            continue
+
+        if f.__doc__:
+            if not optimize:
+                try:
+                    c = re.compile(f.__doc__, re.VERBOSE)
+                except re.error,e:
+                    print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e)
+                    error = 1
+                    continue
+
+                if debug:
+                    print "lex: Adding rule %s -> '%s'" % (f.__name__,f.__doc__)
+
+            # Okay. The regular expression seemed okay. Let's append it to the master regular
+            # expression we're building
+
+            if (regex): regex += "|"
+            regex += "(?P<%s>%s)" % (f.__name__,f.__doc__)
+        else:
+            print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__)
+
+    # Now add all of the simple rules
+    for name,r in ssymbols:
+
+        if name == 't_ignore':
+            lexer.lexignore = r
+            continue
+
+        if not optimize:
+            if name == 't_error':
+                raise SyntaxError,"lex: Rule 't_error' must be defined as a function"
+                error = 1
+                continue
+
+            if not lexer.lextokens.has_key(name[2:]):
+                print "lex: Rule '%s' defined for an unspecified token %s." % (name,name[2:])
+                error = 1
+                continue
+            try:
+                c = re.compile(r,re.VERBOSE)
+            except re.error,e:
+                print "lex: Invalid regular expression for rule '%s'. %s" % (name,e)
+                error = 1
+                continue
+            if debug:
+                print "lex: Adding rule %s -> '%s'" % (name,r)
+
+        if regex: regex += "|"
+        regex += "(?P<%s>%s)" % (name,r)
+
+    if not optimize:
+        for f in files.keys():
+            if not validate_file(f):
+                error = 1
+    try:
+        if debug:
+            print "lex: regex = '%s'" % regex
+        lexer.lexre = re.compile(regex, re.VERBOSE)
+
+        # Build the index to function map for the matching engine
+        lexer.lexindexfunc = [ None ] * (max(lexer.lexre.groupindex.values())+1)
+        for f,i in lexer.lexre.groupindex.items():
+            handle = ldict[f]
+            if isinstance(handle,types.FunctionType):
+                lexer.lexindexfunc[i] = (handle,handle.__name__[2:])
+            else:
+                # If rule was specified as a string, we build an anonymous
+                # callback function to carry out the action
+                lexer.lexindexfunc[i] = (None,f[2:])
+
+        # If a lextab was specified, we create a file containing the precomputed
+        # regular expression and index table
+
+        if lextab and optimize:
+            lt = open(lextab+".py","w")
+            lt.write("# %s.py. This file automatically created by PLY. Don't edit.\n" % lextab)
+            lt.write("_lexre = %s\n" % repr(regex))
+            lt.write("_lextab = [\n");
+            for i in range(0,len(lexer.lexindexfunc)):
+                t = lexer.lexindexfunc[i]
+                if t:
+                    if t[0]:
+                        lt.write("  ('%s',%s),\n"% (t[0].__name__, repr(t[1])))
+                    else:
+                        lt.write("  (None,%s),\n" % repr(t[1]))
+                else:
+                    lt.write("  None,\n")
+
+            lt.write("]\n");
+            lt.write("_lextokens = %s\n" % repr(lexer.lextokens))
+            lt.write("_lexignore = %s\n" % repr(lexer.lexignore))
+            if (lexer.lexerrorf):
+                lt.write("_lexerrorf = %s\n" % repr(lexer.lexerrorf.__name__))
+            else:
+                lt.write("_lexerrorf = None\n")
+            lt.close()
+
+    except re.error,e:
+        print "lex: Fatal error. Unable to compile regular expression rules. %s" % e
+        error = 1
+    if error:
+        raise SyntaxError,"lex: Unable to build lexer."
+    if not lexer.lexerrorf:
+        print "lex: Warning. no t_error rule is defined."
+
+    if not lexer.lexignore: lexer.lexignore = ""
+
+    # Create global versions of the token() and input() functions
+    token = lexer.token
+    input = lexer.input
+
+    return lexer
+
+# -----------------------------------------------------------------------------
+# run()
+#
+# This runs the lexer as a main program
+# -----------------------------------------------------------------------------
+
+def runmain(lexer=None,data=None):
+    if not data:
+        try:
+            filename = sys.argv[1]
+            f = open(filename)
+            data = f.read()
+            f.close()
+        except IndexError:
+            print "Reading from standard input (type EOF to end):"
+            data = sys.stdin.read()
+
+    if lexer:
+        _input = lexer.input
+    else:
+        _input = input
+    _input(data)
+    if lexer:
+        _token = lexer.token
+    else:
+        _token = token
+
+    while 1:
+        tok = _token()
+        if not tok: break
+        print "(%s,'%s',%d)" % (tok.type, tok.value, tok.lineno)
+
+
+
+