import java.util.HashMap; import java.util.Map; public class SpellCorrector { final private CorpusReader cr; final private ConfusionMatrixReader cmr; final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray(); /** * Lambda values for interpolation of n-gram probabilities. The first value * is for unigrams, the second for bigrams, etc. */ private final static double[] LAMBDAS = new double[]{.5, .5}; /** * The language model probability for uncorrected words. */ private final static double LM_PROBABILITY_UNMODIFIED = .95; public SpellCorrector(CorpusReader cr, ConfusionMatrixReader cmr) { this.cr = cr; this.cmr = cmr; } public String correctPhrase(String phrase) { if (phrase == null || phrase.length() == 0) { throw new IllegalArgumentException("phrase must be non-empty."); } String[] words = phrase.split(" "); SentenceRater rater = new SentenceRater(words); // Try to change 1 word and see whether the suggestions are OK. for (int attempt = 0; attempt < 3; attempt++) { for (int i = 0; i < words.length; i++) { final int word_index = i; String old_word = words[word_index]; // try to find a better suggestion for this word. Map candidates = getCandidateWords(old_word); candidates.forEach((word, channel_probability) -> { rater.tryWord(word_index, word, channel_probability); }); }; if (rater.hasBetterSuggestion()) { rater.saveSuggestion(); // TODO: make this nicer words = rater.getBestSentence(); System.err.println("Got new sentence: " + String.join(" ", words)); } else { System.err.println("No suggestion found."); break; } } return String.join(" ", words); } /** * Gets all candidate words, resulting from a single insertion, deletion. * substitution or transposition. * * @param word The (wrong) word to find (corrected) candidates for. * @return Candidate words mapping to the noisy channel model probability. */ public Map getCandidateWords(String word) { Map candidates = new HashMap<>(); // tries to add word2 to the list of words. This word2 was generated // from "word" by changing "error" to "correct". TriFunction push = (word2, error, correct) -> { if (!cr.inVocabulary(word2) || word.equals(word2)) { return; } // Find the channel model probability (probability of the edit). // P(x|w) = "corrections count given error" / "errors count" double correctionCount, errorCount, p_channel; correctionCount = (double) cmr.getConfusionCount(error, correct); errorCount = cmr.getErrorsCount(error); // add-one smoothing p_channel = (correctionCount + 1) / (errorCount + 1); // TODO: take the max instead of addition? // Sum the probabilities as independent modifications can result in // the same word ("acess" -> "access" by "a|ac", "e|ce"). double p = candidates.getOrDefault(word2, 0.0); p += p_channel; candidates.put(word2, p); }; makeWordInsertion(word, push); makeWordSubstitution(word, push); makeWordDeletion(word, push); makeWordTransposition(word, push); return candidates; } private void makeWordInsertion(String word, TriFunction push) { // Generate words by insertion of a character // |p]en -> [ap]en, [p|en -> [pi]en, p[e|n -> p[ei]n, pe[n| -> pe[nm]. for (int i = 0; i <= word.length(); i++) { String head, head_last, tail; // the word is split into [0..i] [i..n] head = word.substring(0, i); tail = word.substring(i); for (char c : ALPHABET) { if (i == 0) { // insert "c" before current letter (i = 0) head_last = word.substring(0); push.call(head + c + tail, head_last, c + head_last); } else { // insert "c" after current letter (i > 0) head_last = word.substring(i - 1, i); push.call(head + c + tail, head_last, head_last + c); } } } } private void makeWordSubstitution(String word, TriFunction push) { // Generate words by substitution of a character. // |p]en -> [P]en, p|e]n -> p[E]n, pe|n] -> pe[N]. for (int i = 0; i < word.length(); i++) { String head, middle, tail; // the word is split into [0..i] [i..i+1] [i+1..n] head = word.substring(0, i); middle = word.substring(i, i + 1); tail = word.substring(i + 1); for (char c : ALPHABET) { // substitution of "middle" with "c" push.call(head + c + tail, middle, "" + c); } } } private void makeWordDeletion(String word, TriFunction push) { // Generate words by deletion of a character. Requires at least two // characters in the word (an empty string is invalid). // |pe]in -> [e]in, p|ei]n -> p[i]n, pe|in] -> pe[n], pe[i|n] -> pe[i]. if (word.length() > 1) { for (int i = 0; i < word.length(); i++) { String head, middle, error, tail; // the word is split into [0..i] [i..i+1] [i+1..n] head = word.substring(0, i); middle = word.substring(i, i + 1); tail = word.substring(i + 1); if (i + 1 < word.length()) { // Common case: deletion of the following character. // p|ei]n -> p[i]n error = word.substring(i, i + 2); } else { // Last case: operate on the previous and next characters. // pe[i|n] -> pe[i] error = word.substring(i - 1, i + 1); } push.call(head + tail, error, middle); } } } private void makeWordTransposition(String word, TriFunction push) { // The Damerau-Levenshtein distance also includes transposition of two // adjacent letters to account for spelling mistakes. // [p|e]n -> [ep]n, p[e|n] -> p[ne]. for (int i = 1; i < word.length(); i++) { String head, middle, transposed, tail; // the word is split into [0..i-1] [i-1..i+1] [i+1..n] head = word.substring(0, i - 1); middle = word.substring(i - 1, i + 1); tail = word.substring(i + 1); transposed = "" + middle.charAt(1) + middle.charAt(0); push.call(head + transposed + tail, middle, transposed); } } private class SentenceRater { private final String[] words; private final double[] word_likelihoods; /** * Words that cannot be modified in further iterations. */ private final boolean[] words_readonly; private double best_likelihood = Double.MIN_VALUE; private WordModification best_modification; public SentenceRater(String[] words) { this.words = words.clone(); this.word_likelihoods = new double[words.length]; this.words_readonly = new boolean[words.length]; for (int i = 0; i < words.length; i++) { word_likelihoods[i] = getWordLikelihood(i, words[i], LM_PROBABILITY_UNMODIFIED); } } /** * Calculates the probability that the word {@code word} is valid at * position {@code index}. */ public double getWordLikelihood(int index, String word, double channel_probability) { String prev_word, ngram; double prior, score, p; // a suggested word not in the vocabulary is certainly wrong, // changed (or consequentive) words should also not be changed. if (!cr.inVocabulary(word) || words_readonly[index]) { return 0.0; } assert channel_probability > 0.0; // P(x|w) is given by language model (noisy channel probability). // Find prior P(w) = (freq(w) + .5) / N. // Then compute candidate score by P(w) * P(x|w) prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount(); score = prior * channel_probability; // Now obtain n-gram probabilities. Use interpolation to combine // unigrams and bigrams. p = LAMBDAS[0] * cr.getSmoothedCount(word) / cr.getUnigramCount(); // Add probability of bi-grams. // For words u and w, P(w|u) = P(u, w) / P(u). if (index > 0) { prev_word = words[index - 1]; ngram = prev_word + " " + word; p += LAMBDAS[1] * cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word); } // Combine the candidate score with the n-gram probabilities. p *= score; assert p > 0.0 : "failed probability for " + word; return p; } /** * Tries to modify word at position {@code index} to word and calculates * the likelihood of the word. The best single-word modification is * remembered (the sentence itself will not be modified). */ public void tryWord(int index, String word, double channel_probability) { double likelihood; // try the modification, calculate the result and restore. likelihood = getWordLikelihood(index, word, channel_probability); // look for the word which increases the likelihood the most // (that is, the difference of the old and new likelihood). likelihood -= word_likelihoods[index]; if (likelihood > best_likelihood) { best_likelihood = likelihood; best_modification = new WordModification(index, word); } } /** * Returns the sentence, possibly changed. */ public String[] getBestSentence() { String[] new_words = words.clone(); if (best_modification != null) { new_words[best_modification.index] = best_modification.word; } return new_words; } /** * Returns true if it is likely that a word in the sentence can be * corrected. */ public boolean hasBetterSuggestion() { return best_modification != null; } /** * Save the best suggestion so far, ensuring that future suggestions * won't overwrite this one. */ public void saveSuggestion() { int index = best_modification.index; String word = best_modification.word; words[index] = word; if (index > 0) { words_readonly[index - 1] = true; } words_readonly[index] = true; if (index + 1 < words.length) { words_readonly[index + 1] = true; } best_modification = null; best_likelihood = Double.MIN_VALUE; word_likelihoods[index] = getWordLikelihood(index, word, LM_PROBABILITY_UNMODIFIED); } private class WordModification { private final int index; private final String word; public WordModification(int index, String word) { this.index = index; this.word = word; } } } }