summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Wu <peter@lekensteyn.nl>2016-05-08 17:20:39 +0200
committerPeter Wu <peter@lekensteyn.nl>2016-05-08 17:20:39 +0200
commit198446b831e6e54b7f485733f594cf24822486be (patch)
treeb652138b0ad0a5b1b7ea1f475e7996ec97182dba
parent36fb690d33b1f487a8d299470cfe9c5d900c58a3 (diff)
downloadRegexTest-198446b831e6e54b7f485733f594cf24822486be.tar.gz
PicoRec: add basic parser plumbing for our language
Add a next() function that returns the next scanned token and match() to consume the token from input. next() takes an automaton such that you can match "begin" in the beginning, but any other string (including "begin" prefixes) for ID.
-rw-r--r--src/PicoRec.java101
1 files changed, 96 insertions, 5 deletions
diff --git a/src/PicoRec.java b/src/PicoRec.java
index eec2f38..f601ce3 100644
--- a/src/PicoRec.java
+++ b/src/PicoRec.java
@@ -11,11 +11,38 @@ public class PicoRec {
/** Regular expression describing the NAT symbol. */
private final static String natRegex = "0|[1-9][0-9]*";
- private final RunAutomaton idR, natR;
+ /** Regular expression describing keywords (not to be interpreted as ID). */
+ private final static String keywordRegex =
+ literalRegex("begin|declare|end");
+ /** Regular expression matches all tokens accepted by the language (other
+ * than keywords). */
+ private final static String tokenRegex =
+ literalRegex("[\\|,;\\+\\*\\-]|:=|" + idRegex + "|" + natRegex);
+ /** Regular expression matching layout tokens (to be ignored). */
+ private final static String layoutRegex = "[ \n]*";
+
+ /** Indicator for the end of input. This symbol does not occur in the
+ * terminals and non-terminals. */
+ private final static String END_SYMBOL = "$";
+
+ private final RunAutomaton idR, natR, keywordR, tokenR, layoutR;
+
+ /** The current parsing state. */
+ private String input;
+ private int offset;
+
+ /** Auxilliary function to produce a regular expression taking into account
+ * layout considerations for symbols. */
+ private static String literalRegex(String regex) {
+ return layoutRegex + "(" + regex + ")" + layoutRegex;
+ }
public PicoRec() {
idR = new RunAutomaton(new RegExp(idRegex).toAutomaton());
natR = new RunAutomaton(new RegExp(natRegex).toAutomaton());
+ keywordR = new RunAutomaton(new RegExp(keywordRegex).toAutomaton());
+ tokenR = new RunAutomaton(new RegExp(tokenRegex).toAutomaton());
+ layoutR = new RunAutomaton(new RegExp(layoutRegex).toAutomaton());
}
/**
@@ -24,8 +51,71 @@ public class PicoRec {
* @return true if the language is recognized, false otherwise.
*/
public boolean parse(String text) {
- // TODO implement this
- return false;
+ input = text;
+ offset = 0;
+ try {
+ // TODO implement this
+ return true;
+ } catch (ParseError ex) {
+ System.err.println("Parsing failed due to: " + ex);
+ return false;
+ }
+ }
+
+ /** Skip layout characters. */
+ private void skipLayout() {
+ int layoutLength = layoutR.run(input, offset);
+ assert(layoutLength >= 0);
+ offset += layoutLength;
+ }
+
+ /** Returns the next token as recognized by the scanner or END_SYMBOL if the
+ * end of input is reached. Subsequent calls will always return the same
+ * value when match() is not called.
+ *
+ * @throws ParseError on unrecognized tokens.
+ */
+ private String next(RunAutomaton r) {
+ String symbol = END_SYMBOL;
+ // "In context-free syntax, layout is allowed between symbols in the
+ // left-hand side of the productions, by automatically inserting
+ // optional layout (e.g. LAYOUT?) between them"
+ skipLayout();
+
+ // match actual symbol.
+ int length = r.run(input, offset);
+ if (length >= 0) {
+ symbol = input.substring(offset, offset + length);
+ }
+
+ // if no symbol matched and this is not the end of string, fail
+ if (length < 0 && offset < input.length()) {
+ throw new ParseError("Unrecognized non-token");
+ }
+ return symbol;
+ }
+
+ /** Consumes the symbol and advanced the input pointer. */
+ private void match(String symbol) {
+ skipLayout();
+ if (END_SYMBOL.equals(symbol)) {
+ if (offset != input.length()) {
+ throw new ParseError("Cannot match end!");
+ }
+ } else {
+ if (!input.startsWith(symbol, offset)) {
+ throw new ParseError("Cannot match symbol!");
+ }
+ offset += symbol.length();
+ }
+ }
+
+ /** Exception that is raised for parser errors. */
+ private static class ParseError extends RuntimeException {
+ private static final long serialVersionUID = 1L;
+ ParseError(String message) {
+ super(message);
+ }
}
@@ -57,13 +147,14 @@ public class PicoRec {
// minimal program satisfying the language, without whitespace.
rec.assertOk("begindeclare|end");
// simple expression
- rec.assertOk("begin declare a | a := 1 end");
+ rec.assertOk("begin declare a | a := 1; end");
- // Trivial failures
+ // Failure cases
rec.assertFail("");
rec.assertFail("begin");
rec.assertFail("begin declare");
rec.assertFail("begin declare |");
+ rec.assertFail("begin declare a | a := 1 end");
rec.assertFail("begin declare | end end");
rec.assertFail("begin declare end");
rec.assertFail("begin end");