/** * Recognizer for the language described by grammar-ll1.txt. */ import dk.brics.automaton.RegExp; import dk.brics.automaton.RunAutomaton; public class PicoRec { /** Regular expression describing the ID symbol. */ private final static String idRegex = "[a-z][a-z0-9]*"; /** Regular expression describing the NAT symbol. */ private final static String natRegex = "0|[1-9][0-9]*"; /** Regular expression describing keywords (not to be interpreted as ID). */ private final static String keywordRegex = literalRegex("begin|declare|end"); /** Regular expression matches all tokens accepted by the language (other * than keywords). */ private final static String tokenRegex = literalRegex("[\\|,;\\+\\*\\-]|:=|" + idRegex + "|" + natRegex); /** Regular expression matching layout tokens (to be ignored). */ private final static String layoutRegex = "[ \n]*"; /** Indicator for the end of input. This symbol does not occur in the * terminals and non-terminals. */ private final static String END_SYMBOL = "$"; private final RunAutomaton idR, natR, keywordR, tokenR, layoutR; /** The current parsing state. */ private String input; private int offset; /** Auxilliary function to produce a regular expression taking into account * layout considerations for symbols. */ private static String literalRegex(String regex) { return layoutRegex + "(" + regex + ")" + layoutRegex; } public PicoRec() { idR = new RunAutomaton(new RegExp(idRegex).toAutomaton()); natR = new RunAutomaton(new RegExp(natRegex).toAutomaton()); keywordR = new RunAutomaton(new RegExp(keywordRegex).toAutomaton()); tokenR = new RunAutomaton(new RegExp(tokenRegex).toAutomaton()); layoutR = new RunAutomaton(new RegExp(layoutRegex).toAutomaton()); } /** * Try to parse the input. * * @throws ParseError if the language is not recognized. */ public void parse(String text) { input = text; offset = 0; } /** Skip layout characters. */ private void skipLayout() { int layoutLength = layoutR.run(input, offset); assert(layoutLength >= 0); offset += layoutLength; } /** Returns the next token as recognized by the scanner or END_SYMBOL if the * end of input is reached. Subsequent calls will always return the same * value when match() is not called. * * @throws ParseError on unrecognized tokens. */ private String next(RunAutomaton r) { String symbol = END_SYMBOL; // "In context-free syntax, layout is allowed between symbols in the // left-hand side of the productions, by automatically inserting // optional layout (e.g. LAYOUT?) between them" skipLayout(); // match actual symbol. int length = r.run(input, offset); if (length >= 0) { symbol = input.substring(offset, offset + length); } // if no symbol matched and this is not the end of string, fail if (length < 0 && offset < input.length()) { throw new ParseError("Unrecognized non-token"); } return symbol; } /** Consumes the symbol and advanced the input pointer. */ private void match(String symbol) { skipLayout(); if (END_SYMBOL.equals(symbol)) { if (offset != input.length()) { throw new ParseError("Cannot match end!"); } } else { if (!input.startsWith(symbol, offset)) { throw new ParseError("Cannot match symbol!"); } offset += symbol.length(); } } /** Exception that is raised for parser errors. */ public static class ParseError extends RuntimeException { private static final long serialVersionUID = 1L; private ParseError(String message) { super(message); } } /* Convenience functions for testing. */ private void check(String input, boolean expectOk) { System.out.println("Testing input: " + input); try { parse(input); if (!expectOk) { throw new RuntimeException("Unexpected pass for " + input); } } catch (ParseError ex) { if (expectOk) { throw new RuntimeException(ex); } } } private void check(String[] input, boolean expectOk) { check(String.join("\n", input), expectOk); } private void assertOk(String... input) { check(input, true); } private void assertFail(String... input) { check(input, false); } public static void main(String[] args) { PicoRec rec = new PicoRec(); // minimal program satisfying the language, without whitespace. rec.assertOk("begindeclare|end"); // simple expression rec.assertOk("begin declare a | a := 1; end"); // Failure cases rec.assertFail(""); rec.assertFail("begin"); rec.assertFail("begin declare"); rec.assertFail("begin declare |"); rec.assertFail("begin declare a | a := 1 end"); rec.assertFail("begin declare | end end"); rec.assertFail("begin declare end"); rec.assertFail("begin end"); rec.assertFail("end"); } }