From c3a9b5c241e89a6df6a87eb62b5ffea551ed4617 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Tue, 31 Mar 2015 22:24:21 +0200 Subject: Refactor getCandidateWords for ConfusionMatrix --- spellchecker/src/SpellCorrector.java | 70 +++++++++++++++++++++++-------- spellchecker/src/TriFunction.java | 13 ++++++ spellchecker/test/SpellCorrectorTest.java | 5 +++ 3 files changed, 70 insertions(+), 18 deletions(-) create mode 100644 spellchecker/src/TriFunction.java (limited to 'spellchecker') diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index 8ebb829..0456165 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -39,38 +39,70 @@ public class SpellCorrector { * Gets all candidate words, resulting from a single addition, deletion. * substitution or transposition. * - * @param word The word to find candidates for. + * @param word The (wrong) word to find (corrected) candidates for. * @return A set of words with candidate words. */ public HashSet getCandidateWords(String word) { HashSet ListOfWords = new HashSet<>(); + // tries to add word2 to the list of words. This word2 was generated + // from "word" by changing "error" to "correct". + TriFunction push = (word2, error, correct) -> { + if (!cr.inVocabulary(word2)) { + return; + } + + ListOfWords.add(word2); + }; + // generate words by insertion of a character for (int i = 0; i <= word.length(); i++) { + String head, head_last, tail; // the word is split into [0..i] [i..n] // if i == word.length, then the last part is empty - String head = word.substring(0, i); - String tail = i < word.length() ? word.substring(i) : ""; + head = word.substring(0, i); + tail = i < word.length() ? word.substring(i) : ""; + + // |p]en -> [ap]en, [p|en -> [pi]en, + // p[e|n -> p[ei]n, pe[n| -> pe[nm]. for (char c : ALPHABET) { - // insertion of a single character - ListOfWords.add(head + c + tail); + if (i == 0) { + // insert "c" before current letter (i = 0) + head_last = word.substring(0); + push.call(head + c + tail, head_last, c + head_last); + } else { + // insert "c" after current letter (i > 0) + head_last = word.substring(i - 1, i); + push.call(head + c + tail, head_last, head_last + c); + } } } + // by substitution or deletion of a character for (int i = 0; i < word.length(); i++) { + String head, middle, error, tail; // the word is split into [0..i] [i..i+1] [i+1..n] // if i == word.length() - 1, then the tail is empty. - String head = word.substring(0, i); - String tail = word.substring(i + 1); + head = word.substring(0, i); + middle = word.substring(i, i + 1); + tail = word.substring(i + 1); + // |p]en -> [P]en, p|e]n -> p[E]n, pe|n] -> pe[N]. for (char c : ALPHABET) { - // substitution - ListOfWords.add(head + c + tail); + // substitution of "middle" with "c" + push.call(head + c + tail, middle, "" + c); } - // deletion of a single character (prevent adding empty words) + // deletion of a single character after the current cursor. + // Requires at least two characters in the word. + // |pe]n -> [e]n, p|en] -> p[n], p[e|n] -> p[e]. if (word.length() > 1) { - ListOfWords.add(head + tail); + if (i + 1 < word.length()) { + error = word.substring(i, i + 2); + } else { + error = word.substring(i - 1, i + 1); + } + push.call(head + tail, error, middle); } } @@ -78,15 +110,17 @@ public class SpellCorrector { // account for spelling mistakes. This loop transposes the previous // character with the current index. for (int i = 1; i < word.length(); i++) { - // the word is split into [0..i-1] [i-1..i] [i..i+1] [i+1..n] - // if i == word.length() - 1, then the tail is empty. - String head = word.substring(0, i - 1); - char c1 = word.charAt(i - 1); - char c2 = word.charAt(i); - String tail = i + 1 < word.length() ? word.substring(i + 1) : ""; + String head, middle, transposed, tail; + // the word is split into [0..i-1] [i-1..i+1] [i+1..n] + head = word.substring(0, i - 1); + middle = word.substring(i - 1, i + 1); + tail = word.substring(i + 1); + + transposed = "" + middle.charAt(1) + middle.charAt(0); // transposition of two adjacent letters - ListOfWords.add(head + c2 + c1 + tail); + // [p|e]n -> [ep]n, p[e|n] -> p[ne]. + push.call(head + transposed + tail, middle, transposed); } return cr.inVocabulary(ListOfWords); diff --git a/spellchecker/src/TriFunction.java b/spellchecker/src/TriFunction.java new file mode 100644 index 0000000..327e670 --- /dev/null +++ b/spellchecker/src/TriFunction.java @@ -0,0 +1,13 @@ + +/** + * An interface for a lambda function accepting three different parameters. + * + * @author Peter Wu + * @param + * @param + * @param + */ +interface TriFunction { + + public void call(A a, B b, C c); +} diff --git a/spellchecker/test/SpellCorrectorTest.java b/spellchecker/test/SpellCorrectorTest.java index 1438b04..a00e236 100644 --- a/spellchecker/test/SpellCorrectorTest.java +++ b/spellchecker/test/SpellCorrectorTest.java @@ -115,6 +115,11 @@ public class SpellCorrectorTest { super(); } + @Override + public boolean inVocabulary(String word) { + return true; + } + @Override public HashSet inVocabulary(Set set) { return new HashSet<>(set); -- cgit v1.2.1