path: root/tools/lex.py
author     Ronnie Sahlberg <ronnie_sahlberg@ozemail.com.au>   2004-05-24 08:33:09 +0000
committer  Ronnie Sahlberg <ronnie_sahlberg@ozemail.com.au>   2004-05-24 08:33:09 +0000
commit     67e156bb57bdffc6b4e28155e1a464d365315735 (patch)
tree       d8cb2df39b97cb21ffb26a300305e8a2ecd2de17 /tools/lex.py
parent     911bad80f01e1a526cb5db6a1c8b67224a0eb6fd (diff)
download   wireshark-67e156bb57bdffc6b4e28155e1a464d365315735.tar.gz
From Tomas Kukosa:
ASN2ETH compiler and support script for lexical and syntactic analysis. Will later be used for all those ASN.1 protocols we haven't implemented yet.

svn path=/trunk/; revision=10983
Diffstat (limited to 'tools/lex.py')
-rw-r--r--   tools/lex.py   681
1 files changed, 681 insertions, 0 deletions
diff --git a/tools/lex.py b/tools/lex.py
new file mode 100644
index 0000000000..7033e8355a
--- /dev/null
+++ b/tools/lex.py
@@ -0,0 +1,681 @@
+#-----------------------------------------------------------------------------
+# ply: lex.py
+#
+# Author: David M. Beazley (beazley@cs.uchicago.edu)
+# Department of Computer Science
+# University of Chicago
+# Chicago, IL 60637
+#
+# Copyright (C) 2001, David M. Beazley
+#
+# $Header: /svn/cvsroot/ethereal/tools/lex.py,v 1.1 2004/05/24 08:33:09 sahlberg Exp $
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+# See the file COPYING for a complete copy of the LGPL.
+#
+#
+# This module automatically constructs a lexical analysis module from regular
+# expression rules defined in a user-defined module. The idea is essentially the same
+# as that used in John Aycock's Spark framework, but the implementation works
+# at the module level rather than requiring the use of classes.
+#
+# This module tries to provide an interface that is closely modeled after
+# the traditional lex interface in Unix. It also differs from Spark
+# in that:
+#
+# - It provides more extensive error checking and reporting if
+# the user supplies a set of regular expressions that can't
+# be compiled or if there is any other kind of a problem in
+# the specification.
+#
+# - The interface is geared towards LALR(1) and LR(1) parser
+# generators. That is, tokens are generated one at a time
+# rather than being generated in advance all in one step.
+#
+# There are a few limitations of this module
+#
+# - The module interface makes it somewhat awkward to support more
+# than one lexer at a time. Although somewhat inelegant from a
+# design perspective, this is rarely a practical concern for
+# most compiler projects.
+#
+# - The lexer requires that the entire input text be read into
+# a string before scanning. I suppose that most machines have
+# enough memory to make this a minor issue, but it makes
+# the lexer somewhat difficult to use in interactive sessions
+# or with streaming data.
+#
+#-----------------------------------------------------------------------------
+
+r"""
+lex.py
+
+This module builds lex-like scanners based on regular expression rules.
+To use the module, simply write a collection of regular expression rules
+and actions like this:
+
+# lexer.py
+import lex
+
+# Define a list of valid tokens
+tokens = (
+    'IDENTIFIER', 'NUMBER', 'PLUS', 'MINUS'
+    )
+
+# Define tokens as functions
+def t_IDENTIFIER(t):
+    r' [a-zA-Z_](\w|_)* '
+    return t
+
+def t_NUMBER(t):
+    r' \d+ '
+    return t
+
+# Some simple tokens with no actions
+t_PLUS = r'\+'
+t_MINUS = r'-'
+
+# Initialize the lexer
+lex.lex()
+
+The tokens list is required and contains a complete list of all valid
+token types that the lexer is allowed to produce. Token types are
+restricted to be valid identifiers. This means that 'MINUS' is a valid
+token type whereas '-' is not.
+
+Rules are defined by writing a function with a name of the form
+t_rulename. Each rule must accept a single argument which is
+a token object generated by the lexer. This token has the following
+attributes:
+
+    t.type   = type string of the token. This is initially set to the
+               name of the rule without the leading t_
+    t.value  = The value of the lexeme.
+    t.lineno = The value of the line number where the token was encountered
+
+For example, the t_NUMBER() rule above might be called with the following:
+
+    t.type = 'NUMBER'
+    t.value = '42'
+    t.lineno = 3
+
+Each rule returns the token object it would like to supply to the
+parser. In most cases, the token t is returned with few, if any
+modifications. To discard a token for things like whitespace or
+comments, simply return nothing. For instance:
+
+def t_whitespace(t):
+    r' \s+ '
+    pass
+
+For faster lexing, you can also define this in terms of the ignore set like this:
+
+t_ignore = ' \t'
+
+The characters in this string are ignored by the lexer. Use of this feature can speed
+up parsing significantly since scanning will immediately proceed to the next token.
+
+lex requires that the token returned by each rule has an attribute
+t.type. Other than this, rules are free to return any kind of token
+object that they wish and may construct a new type of token object
+from the attributes of t (provided the new object has the required
+type attribute).
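+
+For instance, a rule could package the lexeme in a small class of its
+own (MyToken below is purely illustrative and is not part of this
+module), as long as the object it returns still carries the required
+attributes:
+
+class MyToken:
+    def __init__(self, type, value, lineno):
+        self.type = type            # required by lex
+        self.value = value
+        self.lineno = lineno
+
+def t_NUMBER(t):
+    r' \d+ '
+    # Return a custom object instead of t; it still has .type, .value, .lineno
+    return MyToken(t.type, int(t.value), t.lineno)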
+
+If illegal characters are encountered, the scanner executes the
+function t_error(t) where t is a token representing the rest of the
+string that hasn't been matched. If this function isn't defined, a
+LexError exception is raised. The .text attribute of this exception
+object contains the part of the string that wasn't matched.
+
+The t.skip(n) method can be used to skip ahead n characters in the
+input stream. This is usually only used in the error handling rule.
+For instance, the following rule would print an error message and
+continue:
+
+def t_error(t):
+ print "Illegal character in input %s" % t.value[0]
+ t.skip(1)
+
+Of course, a nice scanner might wish to skip more than one character
+if the input looks very corrupted.
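+
+As a rough sketch of that idea (the resynchronization strategy below is
+only an example, not something this module requires), an error rule
+could skip ahead to the next whitespace character:
+
+def t_error(t):
+    print "Illegal character in input %s" % t.value[0]
+    # Resynchronize at the next whitespace character, or skip a single
+    # character if none is found (assumes the rule file imports re).
+    m = re.search(r'\s', t.value)
+    if m:
+        t.skip(max(m.start(), 1))
+    else:
+        t.skip(1)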
+
+The lex module defines a t.lineno attribute on each token that can be used
+to track the current line number in the input. The value of this
+variable is not modified by lex so it is up to your lexer module
+to correctly update its value depending on the lexical properties
+of the input language. To do this, you might write rules such as
+the following:
+
+def t_newline(t):
+    r' \n+ '
+    t.lineno += t.value.count("\n")
+
+To initialize your lexer so that it can be used, simply call the lex.lex()
+function in your rule file. If there are any errors in your
+specification, warning messages or an exception will be generated to
+alert you to the problem.
+
+(dave: this needs to be rewritten)
+To use the newly constructed lexer from another module, simply do
+this:
+
+    import lex
+    import lexer
+    lex.input("position = initial + rate*60")
+
+    while 1:
+        token = lex.token()       # Get a token
+        if not token: break       # No more tokens
+        ... do whatever ...
+
+Assuming that the module 'lexer' has initialized lex as shown
+above, parsing modules can safely import 'lex' without having
+to import the rule file or any additional information about the
+scanner you have defined.
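+
+The lex() function also accepts a few optional keyword arguments (see
+the definition of lex() later in this file for what they do). As a
+rough sketch, a rule file that wants lex to cache its compiled tables
+in a generated lextab.py module, rather than rebuilding them on every
+run, might call:
+
+    lex.lex(optimize=1, lextab="lextab")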
+"""
+
+# -----------------------------------------------------------------------------
+
+
+__version__ = "1.3"
+
+import re, types, sys, copy
+
+# Exception thrown when invalid token encountered and no default
+class LexError(Exception):
+    def __init__(self,message,s):
+        self.args = (message,)
+        self.text = s
+
+# Token class
+class LexToken:
+    def __str__(self):
+        return "LexToken(%s,%r,%d)" % (self.type,self.value,self.lineno)
+    def __repr__(self):
+        return str(self)
+    def skip(self,n):
+        try:
+            self._skipn += n
+        except AttributeError:
+            self._skipn = n
+
+# -----------------------------------------------------------------------------
+# Lexer class
+#
+# input() - Store a new string in the lexer
+# token() - Get the next token
+# -----------------------------------------------------------------------------
+
+class Lexer:
+    def __init__(self):
+        self.lexre = None           # Master regular expression
+        self.lexdata = None         # Actual input data (as a string)
+        self.lexpos = 0             # Current position in input text
+        self.lexlen = 0             # Length of the input text
+        self.lexindexfunc = [ ]     # Reverse mapping of groups to functions and types
+        self.lexerrorf = None       # Error rule (if any)
+        self.lextokens = None       # List of valid tokens
+        self.lexignore = None       # Ignored characters
+        self.lineno = 1             # Current line number
+        self.debug = 0              # Debugging mode
+        self.optimize = 0           # Optimized mode
+        self.token = self.errtoken
+
+    def __copy__(self):
+        c = Lexer()
+        c.lexre = self.lexre
+        c.lexdata = self.lexdata
+        c.lexpos = self.lexpos
+        c.lexlen = self.lexlen
+        c.lexindexfunc = self.lexindexfunc
+        c.lexerrorf = self.lexerrorf
+        c.lextokens = self.lextokens
+        c.lexignore = self.lexignore
+        c.lineno = self.lineno
+        c.optimize = self.optimize
+        c.token = c.realtoken
+        return c
+
+    # ------------------------------------------------------------
+    # input() - Push a new string into the lexer
+    # ------------------------------------------------------------
+    def input(self,s):
+        if not isinstance(s,types.StringType):
+            raise ValueError, "Expected a string"
+        self.lexdata = s
+        self.lexpos = 0
+        self.lexlen = len(s)
+        self.token = self.realtoken
+
+        # Change the token routine to point to realtoken()
+        global token
+        if token == self.errtoken:
+            token = self.token
+
+    # ------------------------------------------------------------
+    # errtoken() - Return error if token is called with no data
+    # ------------------------------------------------------------
+    def errtoken(self):
+        raise RuntimeError, "No input string given with input()"
+
+    # ------------------------------------------------------------
+    # token() - Return the next token from the Lexer
+    #
+    # Note: This function has been carefully implemented to be as fast
+    # as possible. Don't make changes unless you really know what
+    # you are doing
+    # ------------------------------------------------------------
+    def realtoken(self):
+        # Make local copies of frequently referenced attributes
+        lexpos = self.lexpos
+        lexlen = self.lexlen
+        lexignore = self.lexignore
+        lexdata = self.lexdata
+
+        while lexpos < lexlen:
+            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
+            if lexdata[lexpos] in lexignore:
+                lexpos += 1
+                continue
+
+            # Look for a regular expression match
+            m = self.lexre.match(lexdata,lexpos)
+            if m:
+                i = m.lastindex
+                lexpos = m.end()
+                tok = LexToken()
+                tok.value = m.group()
+                tok.lineno = self.lineno
+                tok.lexer = self
+                func,tok.type = self.lexindexfunc[i]
+                if not func:
+                    self.lexpos = lexpos
+                    return tok
+
+                # If token is processed by a function, call it
+                self.lexpos = lexpos
+                newtok = func(tok)
+                self.lineno = tok.lineno        # Update line number
+
+                # Every function must return a token, if nothing, we just move to next token
+                if not newtok: continue
+
+                # Verify type of the token. If not in the token map, raise an error
+                if not self.optimize:
+                    if not self.lextokens.has_key(newtok.type):
+                        raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
+                            func.func_code.co_filename, func.func_code.co_firstlineno,
+                            func.__name__, newtok.type),lexdata[lexpos:])
+
+                return newtok
+
+            # No match. Call t_error() if defined.
+            if self.lexerrorf:
+                tok = LexToken()
+                tok.value = self.lexdata[lexpos:]
+                tok.lineno = self.lineno
+                tok.type = "error"
+                tok.lexer = self
+                oldpos = lexpos
+                newtok = self.lexerrorf(tok)
+                lexpos += getattr(tok,"_skipn",0)
+                if oldpos == lexpos:
+                    # Error method didn't change text position at all. This is an error.
+                    self.lexpos = lexpos
+                    raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
+                if not newtok: continue
+                self.lexpos = lexpos
+                return newtok
+
+            self.lexpos = lexpos
+            raise LexError, ("No match found", lexdata[lexpos:])
+
+        # No more input data
+        self.lexpos = lexpos + 1
+        return None
+
+
+# -----------------------------------------------------------------------------
+# validate_file()
+#
+# This checks to see if there are duplicated t_rulename() functions or strings
+# in the parser input file. This is done using a simple regular expression
+# match on each line in the filename.
+# -----------------------------------------------------------------------------
+
+def validate_file(filename):
+    import os.path
+    base,ext = os.path.splitext(filename)
+    if ext != '.py': return 1          # No idea what the file is. Return OK
+
+    try:
+        f = open(filename)
+        lines = f.readlines()
+        f.close()
+    except IOError:
+        return 1                       # Oh well
+
+    fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
+    sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')
+    counthash = { }
+    linen = 1
+    noerror = 1
+    for l in lines:
+        m = fre.match(l)
+        if not m:
+            m = sre.match(l)
+        if m:
+            name = m.group(1)
+            prev = counthash.get(name)
+            if not prev:
+                counthash[name] = linen
+            else:
+                print "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev)
+                noerror = 0
+        linen += 1
+    return noerror
+
+# -----------------------------------------------------------------------------
+# _read_lextab(module)
+#
+# Reads lexer table from a lextab file instead of using introspection.
+# -----------------------------------------------------------------------------
+
+def _read_lextab(lexer, fdict, module):
+    exec "import %s as lextab" % module
+    lexer.lexre = re.compile(lextab._lexre, re.VERBOSE)
+    lexer.lexindexfunc = lextab._lextab
+    for i in range(len(lextab._lextab)):
+        t = lexer.lexindexfunc[i]
+        if t:
+            if t[0]:
+                lexer.lexindexfunc[i] = (fdict[t[0]],t[1])
+    lexer.lextokens = lextab._lextokens
+    lexer.lexignore = lextab._lexignore
+    if lextab._lexerrorf:
+        lexer.lexerrorf = fdict[lextab._lexerrorf]
+
+# -----------------------------------------------------------------------------
+# lex(module)
+#
+# Build all of the regular expression rules from definitions in the supplied module
+# -----------------------------------------------------------------------------
+def lex(module=None,debug=0,optimize=0,lextab="lextab"):
+    ldict = None
+    regex = ""
+    error = 0
+    files = { }
+    lexer = Lexer()
+    lexer.debug = debug
+    lexer.optimize = optimize
+    global token,input
+
+    if module:
+        if not isinstance(module, types.ModuleType):
+            raise ValueError,"Expected a module"
+
+        ldict = module.__dict__
+
+    else:
+        # No module given. We might be able to get information from the caller.
+        try:
+            raise RuntimeError
+        except RuntimeError:
+            e,b,t = sys.exc_info()
+            f = t.tb_frame
+            f = f.f_back                 # Walk out to our calling function
+            ldict = f.f_globals          # Grab its globals dictionary
+
+    if optimize and lextab:
+        try:
+            _read_lextab(lexer,ldict, lextab)
+            if not lexer.lexignore: lexer.lexignore = ""
+            token = lexer.token
+            input = lexer.input
+            return lexer
+
+        except ImportError:
+            pass
+
+    # Get the tokens map
+    tokens = ldict.get("tokens",None)
+    if not tokens:
+        raise SyntaxError,"lex: module does not define 'tokens'"
+    if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)):
+        raise SyntaxError,"lex: tokens must be a list or tuple."
+
+    # Build a dictionary of valid token names
+    lexer.lextokens = { }
+    if not optimize:
+
+        # Utility function for verifying tokens
+        def is_identifier(s):
+            for c in s:
+                if not (c.isalnum() or c == '_'): return 0
+            return 1
+
+        for n in tokens:
+            if not is_identifier(n):
+                print "lex: Bad token name '%s'" % n
+                error = 1
+            if lexer.lextokens.has_key(n):
+                print "lex: Warning. Token '%s' multiply defined." % n
+            lexer.lextokens[n] = None
+    else:
+        for n in tokens: lexer.lextokens[n] = None
+
+
+    if debug:
+        print "lex: tokens = '%s'" % lexer.lextokens.keys()
+
+    # Get a list of symbols with the t_ prefix
+    tsymbols = [f for f in ldict.keys() if f[:2] == 't_']
+
+    # Now build up a list of functions and a list of strings
+    fsymbols = [ ]
+    ssymbols = [ ]
+    for f in tsymbols:
+        if isinstance(ldict[f],types.FunctionType):
+            fsymbols.append(ldict[f])
+        elif isinstance(ldict[f],types.StringType):
+            ssymbols.append((f,ldict[f]))
+        else:
+            print "lex: %s not defined as a function or string" % f
+            error = 1
+
+    # Sort the functions by line number
+    fsymbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno))
+
+    # Sort the strings by regular expression length
+    ssymbols.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
+
+    # Check for non-empty symbols
+    if len(fsymbols) == 0 and len(ssymbols) == 0:
+        raise SyntaxError,"lex: no rules of the form t_rulename are defined."
+
+    # Add all of the rules defined with actions first
+    for f in fsymbols:
+
+        line = f.func_code.co_firstlineno
+        file = f.func_code.co_filename
+        files[file] = None
+
+        if not optimize:
+            if f.func_code.co_argcount > 1:
+                print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__)
+                error = 1
+                continue
+
+            if f.func_code.co_argcount < 1:
+                print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__)
+                error = 1
+                continue
+
+            if f.__name__ == 't_ignore':
+                print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__)
+                error = 1
+                continue
+
+        if f.__name__ == 't_error':
+            lexer.lexerrorf = f
+            continue
+
+        if f.__doc__:
+            if not optimize:
+                try:
+                    c = re.compile(f.__doc__, re.VERBOSE)
+                except re.error,e:
+                    print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e)
+                    error = 1
+                    continue
+
+                if debug:
+                    print "lex: Adding rule %s -> '%s'" % (f.__name__,f.__doc__)
+
+            # Okay. The regular expression seemed okay. Let's append it to the master regular
+            # expression we're building
+
+            if (regex): regex += "|"
+            regex += "(?P<%s>%s)" % (f.__name__,f.__doc__)
+        else:
+            print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__)
+
+    # Now add all of the simple rules
+    for name,r in ssymbols:
+
+        if name == 't_ignore':
+            lexer.lexignore = r
+            continue
+
+        if not optimize:
+            if name == 't_error':
+                raise SyntaxError,"lex: Rule 't_error' must be defined as a function"
+                error = 1
+                continue
+
+            if not lexer.lextokens.has_key(name[2:]):
+                print "lex: Rule '%s' defined for an unspecified token %s." % (name,name[2:])
+                error = 1
+                continue
+            try:
+                c = re.compile(r,re.VERBOSE)
+            except re.error,e:
+                print "lex: Invalid regular expression for rule '%s'. %s" % (name,e)
+                error = 1
+                continue
+            if debug:
+                print "lex: Adding rule %s -> '%s'" % (name,r)
+
+        if regex: regex += "|"
+        regex += "(?P<%s>%s)" % (name,r)
+
+    if not optimize:
+        for f in files.keys():
+            if not validate_file(f):
+                error = 1
+    try:
+        if debug:
+            print "lex: regex = '%s'" % regex
+        lexer.lexre = re.compile(regex, re.VERBOSE)
+
+        # Build the index to function map for the matching engine
+        lexer.lexindexfunc = [ None ] * (max(lexer.lexre.groupindex.values())+1)
+        for f,i in lexer.lexre.groupindex.items():
+            handle = ldict[f]
+            if isinstance(handle,types.FunctionType):
+                lexer.lexindexfunc[i] = (handle,handle.__name__[2:])
+            else:
+                # If rule was specified as a string, we build an anonymous
+                # callback function to carry out the action
+                lexer.lexindexfunc[i] = (None,f[2:])
+
+        # If a lextab was specified, we create a file containing the precomputed
+        # regular expression and index table
+
+        if lextab and optimize:
+            lt = open(lextab+".py","w")
+            lt.write("# %s.py. This file automatically created by PLY. Don't edit.\n" % lextab)
+            lt.write("_lexre = %s\n" % repr(regex))
+            lt.write("_lextab = [\n");
+            for i in range(0,len(lexer.lexindexfunc)):
+                t = lexer.lexindexfunc[i]
+                if t:
+                    if t[0]:
+                        lt.write("  ('%s',%s),\n"% (t[0].__name__, repr(t[1])))
+                    else:
+                        lt.write("  (None,%s),\n" % repr(t[1]))
+                else:
+                    lt.write("  None,\n")
+
+            lt.write("]\n");
+            lt.write("_lextokens = %s\n" % repr(lexer.lextokens))
+            lt.write("_lexignore = %s\n" % repr(lexer.lexignore))
+            if (lexer.lexerrorf):
+                lt.write("_lexerrorf = %s\n" % repr(lexer.lexerrorf.__name__))
+            else:
+                lt.write("_lexerrorf = None\n")
+            lt.close()
+
+    except re.error,e:
+        print "lex: Fatal error. Unable to compile regular expression rules. %s" % e
+        error = 1
+    if error:
+        raise SyntaxError,"lex: Unable to build lexer."
+    if not lexer.lexerrorf:
+        print "lex: Warning. no t_error rule is defined."
+
+    if not lexer.lexignore: lexer.lexignore = ""
+
+    # Create global versions of the token() and input() functions
+    token = lexer.token
+    input = lexer.input
+
+    return lexer
+
+# -----------------------------------------------------------------------------
+# run()
+#
+# This runs the lexer as a main program
+# -----------------------------------------------------------------------------
+
+def runmain(lexer=None,data=None):
+    if not data:
+        try:
+            filename = sys.argv[1]
+            f = open(filename)
+            data = f.read()
+            f.close()
+        except IndexError:
+            print "Reading from standard input (type EOF to end):"
+            data = sys.stdin.read()
+
+    if lexer:
+        _input = lexer.input
+    else:
+        _input = input
+    _input(data)
+    if lexer:
+        _token = lexer.token
+    else:
+        _token = token
+
+    while 1:
+        tok = _token()
+        if not tok: break
+        print "(%s,'%s',%d)" % (tok.type, tok.value, tok.lineno)
+
+
+
+