/** * Recognizer for the language described by grammar-ll1.txt. */ import dk.brics.automaton.RegExp; import dk.brics.automaton.RunAutomaton; public class PicoRec { /** Regular expression describing the ID symbol. */ private final static String idRegex = "[a-z][a-z0-9]*"; /** Regular expression describing the NAT symbol. */ private final static String natRegex = "0|[1-9][0-9]*"; /** Regular expression matches all tokens accepted by the language. */ private final static String tokenRegex = "[\\|,;\\+\\*\\-\\(\\)]|:=|" + idRegex + "|" + natRegex; /** Regular expression matching layout tokens (to be ignored). */ private final static String layoutRegex = "[ \n]*"; /** Indicator for the end of input. This symbol does not occur in the * terminals and non-terminals. */ private final static String END_SYMBOL = "$"; private final RunAutomaton idR, natR, tokenR, layoutR; /** The current parsing state. */ private String input; private int offset; public PicoRec() { idR = new RunAutomaton(new RegExp(idRegex).toAutomaton()); natR = new RunAutomaton(new RegExp(natRegex).toAutomaton()); tokenR = new RunAutomaton(new RegExp(tokenRegex).toAutomaton()); layoutR = new RunAutomaton(new RegExp(layoutRegex).toAutomaton()); } /** * Try to parse the input. * * @throws ParseError if the language is not recognized. */ public void parse(String text) { input = text; offset = 0; match("begin"); match("declare"); parseIdList(); match("|"); parseStatements(); match("end"); match(END_SYMBOL); } /** Returns true if the next token is a non-keyword identifier. */ private boolean isId(String token) { boolean result = !("begin".equals(token) || "declare".equals(token) || "end".equals(token)) && idR.run(token); //System.err.println("Testing for ID: " + token + ": " + result); return result; } private void parseIdList() { // Parses IDLIST ::= ID "," | while (isId(next())) { match(next()); // ID match(","); } } private void parseStatements() { // Parses STATEMENTS ::= STATEMENT ";" | // => STATEMENTS ::= ID ":=" EXP ";" | while (isId(next())) { match(next()); // ID match(":="); parseExp(); match(";"); } } private void parseExp() { // Parses EXP ::= PROD-EXP PLUS-EXP' parseProdExp(); parsePlusExpPrefixed(); } private void parsePlusExp() { // Parses PLUS-EXP ::= PROD-EXP PLUS-EXP' parseProdExp(); parsePlusExpPrefixed(); } private void parsePlusExpPrefixed() { // Parses PLUS-EXP' ::= "+" PROD-EXP PLUS-EXP' | while ("+".equals(next())) { match("+"); parseProdExp(); parsePlusExpPrefixed(); } } private void parseProdExp() { // Parses PROD-EXP ::= SINGLE-EXP PROD-EXP' parseSingleExp(); parseProdExpPrefixed(); } private void parseProdExpPrefixed() { // Parses PROD-EXP' ::= "*" SINGLE-EXP PROD-EXP' | while ("*".equals(next())) { match("*"); parseSingleExp(); parseProdExpPrefixed(); } } private void parseSingleExp() { // Parses SINGLE-EXP ::= "-" SINGLE-EXP | ID | NAT | "(" EXP ")" String token = next(); if ("-".equals(token)) { match("-"); parseSingleExp(); } else if (isId(token)) { match(token); } else if (natR.run(token)) { match(token); } else if ("(".equals(token)) { match("("); parseExp(); match(")"); } else { throw new ParseError("Invalid expression"); } } /** Skip layout characters. */ private void skipLayout() { int layoutLength = layoutR.run(input, offset); assert(layoutLength >= 0); offset += layoutLength; } /** Returns the next token as recognized by the scanner or END_SYMBOL if the * end of input is reached. Subsequent calls will always return the same * value when match() is not called. * * @throws ParseError on unrecognized tokens. */ private String next() { String symbol = END_SYMBOL; // "In context-free syntax, layout is allowed between symbols in the // left-hand side of the productions, by automatically inserting // optional layout (e.g. LAYOUT?) between them" skipLayout(); // match actual symbol. int length = tokenR.run(input, offset); if (length >= 0) { symbol = input.substring(offset, offset + length); } // if no symbol matched and this is not the end of string, fail if (length < 0 && offset < input.length()) { throw new ParseError("Unrecognized non-token: " + input.substring(offset)); } return symbol; } /** Consumes the symbol and advanced the input pointer. */ private void match(String symbol) { skipLayout(); if (END_SYMBOL.equals(symbol)) { if (offset != input.length()) { throw new ParseError("Cannot match end!"); } } else { if (!input.startsWith(symbol, offset)) { //System.err.println("Current input: " + input.substring(offset)); throw new ParseError("Cannot match symbol: " + symbol); } offset += symbol.length(); } } /** Exception that is raised for parser errors. */ public static class ParseError extends RuntimeException { private static final long serialVersionUID = 1L; private ParseError(String message) { super(message); } } /* Convenience functions for testing. */ private void check(String input, boolean expectOk) { System.out.println("Testing input: " + input); try { parse(input); if (!expectOk) { throw new RuntimeException("Unexpected pass for " + input); } } catch (ParseError ex) { if (expectOk) { throw new RuntimeException(ex); } } } private void assertOk(String input) { check(input, true); } private void assertFail(String input) { check(input, false); } public static void main(String[] args) { PicoRec rec = new PicoRec(); // minimal program satisfying the language, without whitespace. rec.assertOk("begindeclare|end"); // simple expression rec.assertOk("begin declare a, | a := 1; end"); // other tests rec.assertOk("begindeclarea,|a:=1;end"); rec.assertOk(" begin declare a , | a := 1 ; end \n"); // substrings of keywords in IDs should be accepted rec.assertOk("begin declare a, | a := endname; end"); rec.assertOk("begin declare declarative, | a := 1; end"); rec.assertOk("begin declare abc, | abegin := 1; end"); // basic cases rec.assertOk("begin declare a, | end"); rec.assertOk("begin declare | a := 1; end"); rec.assertOk("begin declare a, | a := -a; end"); rec.assertOk("begin declare a, | a := abc; end"); rec.assertOk("begin declare a, | a := 123; end"); rec.assertOk("begin declare a, | a := (a); end"); // more complicated expressions rec.assertOk("begin declare a, | a := 2 * -3 + abc + def; end"); rec.assertOk("begin declare a,b,c, | a := b + -(c*c*c) + -c; end"); rec.assertOk("begin declare a,b, | a := -a; b := 1+23+456; end"); rec.assertOk("begin declare a, | a := ---((a)); end"); // Failure cases rec.assertFail(""); rec.assertFail("begin"); rec.assertFail("begin declare"); rec.assertFail("begin declare |"); rec.assertFail("begin declare a, | a := ; end"); rec.assertFail("begin declare a | a := 1; end"); rec.assertFail("begin declare a, | a := 1 end"); rec.assertFail("begin declare a,b | a := 1; end"); rec.assertFail("begin declare a, | a := 1;; end"); rec.assertFail("begin declare | end end"); rec.assertFail("begin declare | endx"); rec.assertFail("begin declare end"); rec.assertFail("begin end"); rec.assertFail("end"); // Failures in expression rec.assertFail("begin declare a, | a := -; end"); rec.assertFail("begin declare a, | a := -+1; end"); rec.assertFail("begin declare a, | a := ; end"); rec.assertFail("begin declare a, | a := (a; end"); rec.assertFail("begin declare a, | a := (a)); end"); rec.assertFail("begin declare a, | a := end; end"); rec.assertFail("begin declare a, | a := end; end"); // Keywords should not be used as identifiers rec.assertFail("begin declare begin, | a := 1; end"); rec.assertFail("begin declare a, | declare := 1; end"); rec.assertFail("begin declare a, | a := end; end"); // Whitespace is cannot split identifiers rec.assertFail("begin dec lare abc, | end"); rec.assertFail("begin declare a bc, | end"); // Garbage input rec.assertFail("begin declare A, | end"); rec.assertFail("begin declare a, | end,"); rec.assertFail("begin declare a, | end#"); rec.assertFail("begin declare a, | end$"); } }