summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Wu <peter@lekensteyn.nl>2015-03-31 18:46:28 +0200
committerPeter Wu <peter@lekensteyn.nl>2015-03-31 18:46:28 +0200
commitecce7ed7b203ac6937df1ff705cd4e14217cb3ca (patch)
treea0a42df57e8cf7f958f4c50a3f8481162fb048fa
parentf28f52b09dcbe1d9c2416f31920f8844fc10e3a3 (diff)
downloadassignment4-ecce7ed7b203ac6937df1ff705cd4e14217cb3ca.tar.gz
Tests and implementation for getCandidateWords
SpellCorrector.ALPHABET is made static to allow for the test.
-rw-r--r--spellchecker/src/SpellCorrector.java42
-rw-r--r--spellchecker/test/SpellCorrectorTest.java118
2 files changed, 155 insertions, 5 deletions
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index c8451bf..c0d7818 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -6,7 +6,7 @@ public class SpellCorrector {
final private CorpusReader cr;
final private ConfusionMatrixReader cmr;
- final char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray();
+ final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray();
public SpellCorrector(CorpusReader cr, ConfusionMatrixReader cmr) {
this.cr = cr;
@@ -35,12 +35,44 @@ public class SpellCorrector {
return 0.0;
}
+ /**
+ * Gets all candidate words, resulting from a single addition, deletion or
+ * substitution.
+ *
+ * @param word The word to find candidates for.
+ * @return A set of words with candidate words.
+ */
public HashSet<String> getCandidateWords(String word) {
- HashSet<String> ListOfWords = new HashSet<String>();
+ HashSet<String> ListOfWords = new HashSet<>();
- /**
- * CODE TO BE ADDED *
- */
+ // generate words by insertion of a character
+ for (int i = 0; i <= word.length(); i++) {
+ // the word is split into [0..i] [i..n]
+ // if i == word.length, then the last part is empty
+ String head = word.substring(0, i);
+ String tail = i < word.length() ? word.substring(i) : "";
+ for (char c : ALPHABET) {
+ // insertion of a single character
+ ListOfWords.add(head + c + tail);
+ }
+ }
+
+ for (int i = 0; i < word.length(); i++) {
+ // the word is split into [0..i] [i..i+1] [i+1..n]
+ // if i == word.length() - 1, then the tail is empty.
+ String head = word.substring(0, i);
+ String tail = word.substring(i + 1);
+
+ for (char c : ALPHABET) {
+ // substitution
+ ListOfWords.add(head + c + tail);
+ }
+
+ // deletion of a single character (prevent adding empty words)
+ if (word.length() > 1) {
+ ListOfWords.add(head + tail);
+ }
+ }
return cr.inVocabulary(ListOfWords);
}
}
diff --git a/spellchecker/test/SpellCorrectorTest.java b/spellchecker/test/SpellCorrectorTest.java
new file mode 100644
index 0000000..dbea0ab
--- /dev/null
+++ b/spellchecker/test/SpellCorrectorTest.java
@@ -0,0 +1,118 @@
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+import org.junit.Test;
+import static org.junit.Assert.*;
+import org.junit.Before;
+
+/**
+ * @author Peter Wu
+ */
+public class SpellCorrectorTest {
+
+ private CorpusReader fullCR;
+
+ @Before
+ public void setUp() throws IOException {
+ fullCR = new MockCorpusReader();
+ }
+
+ private void checkGetCandidateWords(CorpusReader cr, String word,
+ Set<String> expResult)
+ throws IOException {
+ System.out.println("getCandidateWords(" + word + ")");
+ SpellCorrector instance = new SpellCorrector(cr, null);
+ HashSet<String> result = instance.getCandidateWords(word);
+ // quick check: are the results of the same size?
+ assertEquals(expResult.size(), result.size());
+ // verbose test: are all letters as expected?
+ assertEquals(expResult, result);
+ }
+
+ @Test
+ public void testGetCandidateWords0() throws IOException {
+ Set<String> words = new HashSet<>();
+
+ // test for empty word (only a letter can be inserted)
+ for (char c : SpellCorrector.ALPHABET) {
+ // insertion
+ words.add("" + c);
+ }
+ checkGetCandidateWords(fullCR, "", words);
+ }
+
+ @Test
+ public void testGetCandidateWords1() throws IOException {
+ Set<String> words = new HashSet<>();
+
+ // test for a single letter word
+ for (char c : SpellCorrector.ALPHABET) {
+ // insertion before and after
+ words.add(c + "a");
+ words.add("a" + c);
+ // substitution should replace the letter.
+ words.add("" + c);
+ }
+ // deletion should not yield a string
+ //words.add("");
+ checkGetCandidateWords(fullCR, "a", words);
+ }
+
+ @Test
+ public void testGetCandidateWords2() throws IOException {
+ Set<String> words = new HashSet<>();
+ // test for a two letter word
+ for (char c : SpellCorrector.ALPHABET) {
+ // insertion before and after
+ words.add(c + "up");
+ words.add("u" + c + "p");
+ words.add("up" + c);
+ // substitution should replace the letter.
+ words.add(c + "p");
+ words.add("u" + c);
+ }
+ // deletion
+ words.add("p");
+ words.add("u");
+ checkGetCandidateWords(fullCR, "up", words);
+ }
+
+ @Test
+ public void testGetCandidateWords3() throws IOException {
+ Set<String> words = new HashSet<>();
+ // test for a three letter word
+ for (char c : SpellCorrector.ALPHABET) {
+ // insertion before and after
+ words.add(c + "ups");
+ words.add("u" + c + "ps");
+ words.add("up" + c + "s");
+ words.add("ups" + c);
+ // substitution should replace the letter.
+ words.add(c + "ps");
+ words.add("u" + c + "s");
+ words.add("up" + c);
+ }
+ // deletion
+ words.add("ps");
+ words.add("us");
+ words.add("up");
+ checkGetCandidateWords(fullCR, "ups", words);
+ }
+
+ /**
+ * Fake CorpusReader which tests whether getCandidateWords can produce all
+ * words.
+ */
+ private class MockCorpusReader extends CorpusReader {
+
+ public MockCorpusReader() throws IOException {
+ super();
+ }
+
+ @Override
+ public HashSet<String> inVocabulary(Set<String> set) {
+ return new HashSet<>(set);
+ }
+ }
+}