/* * Authors: * Peter Wu (0783206) * Andrea Evangelista (0876766) */ package regex; import dk.brics.automaton.RegExp; import dk.brics.automaton.RunAutomaton; public class RegexTest { private final RunAutomaton r; public RegexTest(String regex) { System.out.println("regular expression = " + regex); r = new RunAutomaton(new RegExp(regex).toAutomaton()); } public long dfaMatch(String input, int index) { long start = System.nanoTime(); int length = r.run(input, index); long end = System.nanoTime(); if (length == -1) { System.out.println("No match found!"); } else { String s = input.substring(index, index + length); System.out.println("Found: " + s); } return end - start; } void runTest(String input, int index) { System.out.println("input string = " + input); System.out.println("index = " + index); long dfaMatchTime = dfaMatch(input, index); System.out.println("dfaMatchTime " + dfaMatchTime); } private void check(String input, boolean expectOk) { boolean accepted = r.run(input); System.out.println("Testing input: " + input); if (expectOk && !accepted || !expectOk && accepted) { throw new RuntimeException("Unexpected result for " + input); } } void assertOk(String input) { check(input, true); } void assertFail(String input) { check(input, false); } static void checkId() { RegexTest reId = new RegexTest("[a-z][a-z0-9]*"); reId.assertOk("id"); reId.assertOk("d0d"); reId.assertFail("Id"); reId.assertFail("0"); reId.assertFail("0d"); reId.assertFail("aD"); System.out.println(); } static void checkNAT() { RegexTest reNAT = new RegexTest("0|[1-9][0-9]*"); reNAT.assertOk("0"); reNAT.assertOk("2"); reNAT.assertOk("14451"); reNAT.assertFail("-145711"); reNAT.assertFail("01"); System.out.println(); } static void checkFLOAT() { String UnsignedInt = "([0]|([1-9][0-9]*))"; String SignedInt = "[\\+\\-]?" + UnsignedInt; String UnsignedReal = "(" + UnsignedInt + "\\." + "[0-9]+" + "([eE]" + SignedInt + ")?" + ")"; UnsignedReal += "|(" + UnsignedInt + "[eE]" + SignedInt + ")"; String Number = UnsignedInt + "|" + UnsignedReal; RegexTest reFLOAT = new RegexTest(Number); reFLOAT.assertOk("0"); reFLOAT.assertOk("1"); reFLOAT.assertOk("14"); reFLOAT.assertOk("0.1"); reFLOAT.assertOk("3e4"); reFLOAT.assertOk("3.014e-7"); reFLOAT.assertOk("3.14E-7"); reFLOAT.assertFail("00"); reFLOAT.assertFail("01"); reFLOAT.assertFail("04.1"); reFLOAT.assertFail("3e04"); reFLOAT.assertFail("3.14e-07"); reFLOAT.assertFail(""); reFLOAT.assertFail("e7"); System.out.println(); } static void checkString() { /* String ::= (UnescapedChar | "\" EscapedChar)* * UnescapedChar ::= Char - ["] - "\" * (* All Unicode chars but quote and backslash *) * EscapedChar ::= ["] | "\" */ RegexTest reString = new RegexTest("([^\\\"\\\\]|\\\\[\\\"\\\\])*"); reString.assertOk(""); reString.assertOk("abc"); reString.assertOk("a\\\"b\\\"c"); reString.assertOk("\\\""); reString.assertFail("a\"b\"c"); reString.assertFail("\""); reString.assertFail("\\"); reString.assertFail("\\x"); reString.assertFail("\\\\\""); System.out.println(); } static void checkMatlabComment() { /* EOL ::= #0a (* LF character *) * WSP ::= " " | #09 (* horizontal space and tab *) * CharWithoutEOL ::= Char - EOL * SingleLineComment ::= "%" CharWithoutEOL* * MultiLineComment ::= "%{" WSP* EOL (CharWithoutEOL* EOL)* WSP* "%}" * Comment ::= SingleLineComment | MultiLineComment * * According to the Mathlab docs[1], there can be no other data on the * same line. According to a Wikibooks article[2] and by manual testing * in Mathlab (actually Octave), it turns out that whitespace is allowed * on the same line as "%{" and "%}". We only care about the begin/end * of comment, so excluding any leading and trailing whitespace. * * The EOL is not well-specified, assume LF as that is the format that * seems to be used[3]. * * [1]: http://nl.mathworks.com/help/matlab/matlab_prog/comments.html * [2]: https://en.wikibooks.org/wiki/MATLAB_Programming/Comments * [3]: https://stackoverflow.com/q/6823168 */ String sl = "%[^\n]*"; String ml = "%\\{[ \t]*\n" + "([^\n]*\n)*" + "[ \t]*%\\}"; RegexTest reMatlabComment = new RegexTest("(" + sl + ")|(" + ml + ")"); // single line comments reMatlabComment.assertOk("%"); reMatlabComment.assertOk("%abc"); reMatlabComment.assertOk("% abc"); reMatlabComment.assertOk("%%"); reMatlabComment.assertOk("%% abc"); reMatlabComment.assertFail("%\n"); reMatlabComment.assertFail("%a\n"); reMatlabComment.assertFail("%%\n"); reMatlabComment.assertFail("%a\n%b"); // Single-line comments that could be mistaken for multi-line comments reMatlabComment.assertOk("%{%}"); reMatlabComment.assertOk("%{%}x"); reMatlabComment.assertOk("%{x%}"); reMatlabComment.assertOk("%{%}%}"); // Multi-line comments reMatlabComment.assertOk("%{\n%}"); reMatlabComment.assertOk("%{\n\n%}"); reMatlabComment.assertOk("%{\nxxx\n%}"); reMatlabComment.assertOk("%{\n\nxxx\n%}"); reMatlabComment.assertOk("%{\nxxx\n\n%}"); reMatlabComment.assertOk("%{ \n %}"); reMatlabComment.assertOk("%{ \nx\n \t %}"); reMatlabComment.assertFail("%{x\n%}"); reMatlabComment.assertFail("%{\nx%}"); reMatlabComment.assertFail("%{\n%}x"); reMatlabComment.assertFail("%{ \nx %}"); reMatlabComment.assertFail("%{\n%}%}"); System.out.println(); } static void checkJavaComment() { /* CR ::= #0a * LF ::= #0d * InputCharacter ::= Char - CR - LF * LineTerminator ::= LF | CR | CR LF * NotStar ::= InputCharacter - "*" | LineTerminator * NotStarSlash ::= InputCharacter - "*" - "/" | LineTerminator * * TraditionalComment ::= "/" "*" CommentTail * CommentTail ::= "*" CommentTailStar | NotStar CommentTail * CommentTailStar ::= "/" | "*" CommentTailStar * CommentTailStar ::= NotStarSlash CommentTail * EndOfLineComment ::= "/" "/" InputCharacter* * Comment ::= TraditionalComment | EndOfLineComment * * Note: while the Java Language Specification dictates that * UnicodeEscape tokens are recognized, here it is assumed that this * step is already handled. If not, then all "/", "*", newlines and Char * tokens should be either its respective literal or the UnicodeEscape * that represents this literal. * * https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.7 */ String sl = "//[^\n\r]*"; // Match parts starting with a star, followed by CommentTailStar, but // only where CommentTailStar = "*" CommentTailStar, terminated by // NotStarSlash [CommentTail]. There is a special case in the "ml" // regex for when CommentTailStar does not contain NotStarSlash. String star = "\\*+[^/*]"; String nostar = "[^\\*]"; String ml = "/\\*" + "(" + star + "|" + nostar + ")*\\**" + "\\*/"; RegexTest reJavaComment = new RegexTest(sl + "|" + ml); // Single-line comment reJavaComment.assertOk("//"); reJavaComment.assertOk("// ok"); reJavaComment.assertOk("/// ok//"); reJavaComment.assertFail("//\n"); reJavaComment.assertFail("//\n//"); // Multi-line comments reJavaComment.assertOk("/**/"); reJavaComment.assertOk("/***/"); reJavaComment.assertOk("/* xxx */"); reJavaComment.assertOk("/*\n*/"); reJavaComment.assertOk("/*\r\n*/"); reJavaComment.assertOk("/*/*/"); reJavaComment.assertOk("/*//*/"); reJavaComment.assertOk("/*///*/"); reJavaComment.assertOk("/** */"); reJavaComment.assertOk("/** /*/"); reJavaComment.assertOk("/*xxx\n*/"); reJavaComment.assertOk("/*\nxxx*/"); reJavaComment.assertOk("/*\nxxx\n*/"); reJavaComment.assertOk("/*\nxx\nx\n*/"); reJavaComment.assertFail("/**/ x"); reJavaComment.assertFail("/**///"); reJavaComment.assertFail("/**//*/"); reJavaComment.assertFail("/**//**/"); System.out.println(); } public static void main(String[] args) { checkId(); checkNAT(); checkFLOAT(); checkString(); checkMatlabComment(); checkJavaComment(); System.out.println("Passed."); } }