From ecce7ed7b203ac6937df1ff705cd4e14217cb3ca Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 31 Mar 2015 18:46:28 +0200 Subject: Tests and implementation for getCandidateWords SpellCorrector.ALPHABET is made static to allow for the test. --- spellchecker/src/SpellCorrector.java | 42 +++++++++-- spellchecker/test/SpellCorrectorTest.java | 118 ++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+), 5 deletions(-) create mode 100644 spellchecker/test/SpellCorrectorTest.java diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index c8451bf..c0d7818 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -6,7 +6,7 @@ public class SpellCorrector { final private CorpusReader cr; final private ConfusionMatrixReader cmr; - final char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray(); + final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray(); public SpellCorrector(CorpusReader cr, ConfusionMatrixReader cmr) { this.cr = cr; @@ -35,12 +35,44 @@ public class SpellCorrector { return 0.0; } + /** + * Gets all candidate words, resulting from a single addition, deletion or + * substitution. + * + * @param word The word to find candidates for. + * @return A set of words with candidate words. + */ public HashSet getCandidateWords(String word) { - HashSet ListOfWords = new HashSet(); + HashSet ListOfWords = new HashSet<>(); - /** - * CODE TO BE ADDED * - */ + // generate words by insertion of a character + for (int i = 0; i <= word.length(); i++) { + // the word is split into [0..i] [i..n] + // if i == word.length, then the last part is empty + String head = word.substring(0, i); + String tail = i < word.length() ? word.substring(i) : ""; + for (char c : ALPHABET) { + // insertion of a single character + ListOfWords.add(head + c + tail); + } + } + + for (int i = 0; i < word.length(); i++) { + // the word is split into [0..i] [i..i+1] [i+1..n] + // if i == word.length() - 1, then the tail is empty. + String head = word.substring(0, i); + String tail = word.substring(i + 1); + + for (char c : ALPHABET) { + // substitution + ListOfWords.add(head + c + tail); + } + + // deletion of a single character (prevent adding empty words) + if (word.length() > 1) { + ListOfWords.add(head + tail); + } + } return cr.inVocabulary(ListOfWords); } } diff --git a/spellchecker/test/SpellCorrectorTest.java b/spellchecker/test/SpellCorrectorTest.java new file mode 100644 index 0000000..dbea0ab --- /dev/null +++ b/spellchecker/test/SpellCorrectorTest.java @@ -0,0 +1,118 @@ + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; +import org.junit.Test; +import static org.junit.Assert.*; +import org.junit.Before; + +/** + * @author Peter Wu + */ +public class SpellCorrectorTest { + + private CorpusReader fullCR; + + @Before + public void setUp() throws IOException { + fullCR = new MockCorpusReader(); + } + + private void checkGetCandidateWords(CorpusReader cr, String word, + Set expResult) + throws IOException { + System.out.println("getCandidateWords(" + word + ")"); + SpellCorrector instance = new SpellCorrector(cr, null); + HashSet result = instance.getCandidateWords(word); + // quick check: are the results of the same size? + assertEquals(expResult.size(), result.size()); + // verbose test: are all letters as expected? + assertEquals(expResult, result); + } + + @Test + public void testGetCandidateWords0() throws IOException { + Set words = new HashSet<>(); + + // test for empty word (only a letter can be inserted) + for (char c : SpellCorrector.ALPHABET) { + // insertion + words.add("" + c); + } + checkGetCandidateWords(fullCR, "", words); + } + + @Test + public void testGetCandidateWords1() throws IOException { + Set words = new HashSet<>(); + + // test for a single letter word + for (char c : SpellCorrector.ALPHABET) { + // insertion before and after + words.add(c + "a"); + words.add("a" + c); + // substitution should replace the letter. + words.add("" + c); + } + // deletion should not yield a string + //words.add(""); + checkGetCandidateWords(fullCR, "a", words); + } + + @Test + public void testGetCandidateWords2() throws IOException { + Set words = new HashSet<>(); + // test for a two letter word + for (char c : SpellCorrector.ALPHABET) { + // insertion before and after + words.add(c + "up"); + words.add("u" + c + "p"); + words.add("up" + c); + // substitution should replace the letter. + words.add(c + "p"); + words.add("u" + c); + } + // deletion + words.add("p"); + words.add("u"); + checkGetCandidateWords(fullCR, "up", words); + } + + @Test + public void testGetCandidateWords3() throws IOException { + Set words = new HashSet<>(); + // test for a three letter word + for (char c : SpellCorrector.ALPHABET) { + // insertion before and after + words.add(c + "ups"); + words.add("u" + c + "ps"); + words.add("up" + c + "s"); + words.add("ups" + c); + // substitution should replace the letter. + words.add(c + "ps"); + words.add("u" + c + "s"); + words.add("up" + c); + } + // deletion + words.add("ps"); + words.add("us"); + words.add("up"); + checkGetCandidateWords(fullCR, "ups", words); + } + + /** + * Fake CorpusReader which tests whether getCandidateWords can produce all + * words. + */ + private class MockCorpusReader extends CorpusReader { + + public MockCorpusReader() throws IOException { + super(); + } + + @Override + public HashSet inVocabulary(Set set) { + return new HashSet<>(set); + } + } +} -- cgit v1.2.1