import java.util.HashMap; import java.util.Map; public class SpellCorrector { final private CorpusReader cr; final private ConfusionMatrixReader cmr; final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray(); public SpellCorrector(CorpusReader cr, ConfusionMatrixReader cmr) { this.cr = cr; this.cmr = cmr; } public String correctPhrase(String phrase) { if (phrase == null || phrase.length() == 0) { throw new IllegalArgumentException("phrase must be non-empty."); } String[] words = phrase.split(" "); String finalSuggestion = ""; /** * CODE TO BE ADDED * */ return finalSuggestion.trim(); } /** * Gets all candidate words, resulting from a single insertion, deletion. * substitution or transposition. * * @param word The (wrong) word to find (corrected) candidates for. * @return Candidate words mapping to the noisy channel model probability. */ public Map getCandidateWords(String word) { Map candidates = new HashMap<>(); // tries to add word2 to the list of words. This word2 was generated // from "word" by changing "error" to "correct". TriFunction push = (word2, error, correct) -> { if (!cr.inVocabulary(word2)) { return; } // Find the channel model probability (probability of the edit). // P(x|w) = "corrections count given error" / "errors count" double correctionCount, errorCount, p_channel; correctionCount = (double) cmr.getConfusionCount(error, correct); errorCount = cmr.getErrorsCount(error); // is this a known correction? if (errorCount == 0.0) { // no, p_channel = 0.0; } else { // yes, p_channel = correctionCount / errorCount; } // Sum the probabilities as independent modifications can result in // the same word ("acess" -> "access" by "a|ac", "e|ce"). double p = candidates.getOrDefault(word2, 0.0); p += p_channel; candidates.put(word2, p); }; makeWordInsertion(word, push); makeWordSubstitution(word, push); makeWordDeletion(word, push); makeWordTransposition(word, push); return candidates; } private void makeWordInsertion(String word, TriFunction push) { // Generate words by insertion of a character // |p]en -> [ap]en, [p|en -> [pi]en, p[e|n -> p[ei]n, pe[n| -> pe[nm]. for (int i = 0; i <= word.length(); i++) { String head, head_last, tail; // the word is split into [0..i] [i..n] head = word.substring(0, i); tail = word.substring(i); for (char c : ALPHABET) { if (i == 0) { // insert "c" before current letter (i = 0) head_last = word.substring(0); push.call(head + c + tail, head_last, c + head_last); } else { // insert "c" after current letter (i > 0) head_last = word.substring(i - 1, i); push.call(head + c + tail, head_last, head_last + c); } } } } private void makeWordSubstitution(String word, TriFunction push) { // Generate words by substitution of a character. // |p]en -> [P]en, p|e]n -> p[E]n, pe|n] -> pe[N]. for (int i = 0; i < word.length(); i++) { String head, middle, tail; // the word is split into [0..i] [i..i+1] [i+1..n] head = word.substring(0, i); middle = word.substring(i, i + 1); tail = word.substring(i + 1); for (char c : ALPHABET) { // substitution of "middle" with "c" push.call(head + c + tail, middle, "" + c); } } } private void makeWordDeletion(String word, TriFunction push) { // Generare words by deletion of a character. Requires at least two // characters in the word (an empty string is invalid). // |pe]in -> [e]in, p|ei]n -> p[i]n, pe|in] -> pe[n], pe[i|n] -> pe[i]. if (word.length() > 1) { for (int i = 0; i < word.length(); i++) { String head, middle, error, tail; // the word is split into [0..i] [i..i+1] [i+1..n] head = word.substring(0, i); middle = word.substring(i, i + 1); tail = word.substring(i + 1); if (i + 1 < word.length()) { // Common case: deletion of the following character. // p|ei]n -> p[i]n error = word.substring(i, i + 2); } else { // Last case: operate on the previous and next characters. // pe[i|n] -> pe[i] error = word.substring(i - 1, i + 1); } push.call(head + tail, error, middle); } } } private void makeWordTransposition(String word, TriFunction push) { // The Damerau-Levenshtein distance also includes transposition of two // adjacent letters to account for spelling mistakes. // [p|e]n -> [ep]n, p[e|n] -> p[ne]. for (int i = 1; i < word.length(); i++) { String head, middle, transposed, tail; // the word is split into [0..i-1] [i-1..i+1] [i+1..n] head = word.substring(0, i - 1); middle = word.substring(i - 1, i + 1); tail = word.substring(i + 1); transposed = "" + middle.charAt(1) + middle.charAt(0); push.call(head + transposed + tail, middle, transposed); } } }