import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; public class SpellCorrector { final private CorpusReader cr; final private ConfusionMatrixReader cmr; final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray(); /** * The highest n-gram to look for. 1 gives a unigram, 2 gives a bigram, etc. */ private final static int NGRAM_N = 2; /** * The language model probability for uncorrected words. */ private final static double LM_PROBABILITY_UNMODIFIED = .95; /** * The maximum number of misspelled words to look for. */ private final static int MAX_TYPOS = 2; private final boolean DEBUG_SCORE = System.getenv("DEBUG_SCORE") != null; public SpellCorrector(CorpusReader cr, ConfusionMatrixReader cmr) { this.cr = cr; this.cmr = cmr; } private void debugPrint(String str) { // print debugging information if NO_PEACH is set. if (System.getenv("NO_PEACH") != null) { System.err.println(str); } } /** * Converts a set of word indices which were modified to an array of length * {@code n} of which elements are set to true when those should not be * modified. */ private boolean[] getReadonlyIndices(int n, Set modifiedIndices) { boolean[] words_readonly = new boolean[n]; for (Integer index : modifiedIndices) { if (index > 0) { words_readonly[index - 1] = true; } words_readonly[index] = true; if (index + 1 < n) { words_readonly[index + 1] = true; } } return words_readonly; } /** * Tries to find up to {@code maxTypos} number of misspelled words. * * @return The resulting rater containing probabilities and the corrected * sentence or null if no suggestions are found. */ private SentenceRater findBetterWord(String[] words, Set modifiedIndices) { SentenceRater bestResult = null; // find best word for (int i = 0; i < words.length; i++) { final int word_index = i; String old_word = words[word_index]; SentenceRater rater = new SentenceRater(words, getReadonlyIndices(words.length, modifiedIndices)); // try to find a better suggestion for this word. Map candidates = getCandidateWords(old_word); debugPrint("\n\n LOOKING AT " + i + " " + old_word); candidates.forEach((word, channel_probability) -> { rater.tryWord(word_index, word, channel_probability); }); // if a better word was found, use the change if (rater.hasBetterSuggestion()) { rater.saveSuggestion(); bestResult = rater; // If some other errors are still possible, hunt for those! modifiedIndices.add(i); if (modifiedIndices.size() < MAX_TYPOS) { SentenceRater subResult; subResult = findBetterWord(rater.getBestSentence(), modifiedIndices); // Did it find a corrected word? if (subResult != null) { // Yes it did. Is this the best one so far? if (subResult.getBestScore() > rater.getBestScore()) { debugPrint("Subresult is better!"); bestResult = subResult; } else { debugPrint("Subresult is not better!"); } } else { debugPrint("no subresult found"); } } // make the context to be editable again modifiedIndices.remove(i); } else { debugPrint("No suggestion found for " + old_word + "."); } } return bestResult; } public String correctPhrase(String phrase) { if (phrase == null || phrase.length() == 0) { throw new IllegalArgumentException("phrase must be non-empty."); } String[] words = phrase.split(" "); SentenceRater rater = findBetterWord(words, new HashSet<>()); // if a better sentence is found, use it. if (rater != null) { words = rater.getBestSentence(); } return String.join(" ", words); } /** * Gets all candidate words, resulting from a single insertion, deletion. * substitution or transposition. * * @param word The (wrong) word to find (corrected) candidates for. * @return Candidate words mapping to the noisy channel model probability. */ public Map getCandidateWords(String word) { Map candidates = new HashMap<>(); // tries to add word2 to the list of words. This word2 was generated // from "word" by changing "error" to "correct". TriFunction push = (word2, error, correct) -> { if (!cr.inVocabulary(word2) || word.equals(word2)) { return; } // Find the channel model probability (probability of the edit). // P(x|w) = "corrections count given error" / "errors count" double correctionCount, errorCount, p_channel; correctionCount = (double) cmr.getConfusionCount(error, correct); errorCount = cmr.getErrorsCount(error); // add-one smoothing p_channel = (correctionCount + 1) / (errorCount + 1); // while we could sum here, it does not make sense for the // probability. Use the probability of the most likely change type. double p = candidates.getOrDefault(word2, 0.0); p = Math.max(p, p_channel); candidates.put(word2, p); }; makeWordInsertion(word, push); makeWordSubstitution(word, push); makeWordDeletion(word, push); makeWordTransposition(word, push); return candidates; } private void makeWordInsertion(String word, TriFunction push) { // Generate words by insertion of a character // |p]en -> [ap]en, [p|en -> [pi]en, p[e|n -> p[ei]n, pe[n| -> pe[nm]. for (int i = 0; i <= word.length(); i++) { String head, head_last, tail; // the word is split into [0..i] [i..n] head = word.substring(0, i); tail = word.substring(i); for (char c : ALPHABET) { if (i == 0) { // insert "c" before current letter (i = 0) head_last = word.substring(0); push.call(head + c + tail, head_last, c + head_last); } else { // insert "c" after current letter (i > 0) head_last = word.substring(i - 1, i); push.call(head + c + tail, head_last, head_last + c); } } } } private void makeWordSubstitution(String word, TriFunction push) { // Generate words by substitution of a character. // |p]en -> [P]en, p|e]n -> p[E]n, pe|n] -> pe[N]. for (int i = 0; i < word.length(); i++) { String head, middle, tail; // the word is split into [0..i] [i..i+1] [i+1..n] head = word.substring(0, i); middle = word.substring(i, i + 1); tail = word.substring(i + 1); for (char c : ALPHABET) { // substitution of "middle" with "c" push.call(head + c + tail, middle, "" + c); } } } private void makeWordDeletion(String word, TriFunction push) { // Generate words by deletion of a character. Requires at least two // characters in the word (an empty string is invalid). // |pe]in -> [e]in, p|ei]n -> p[i]n, pe|in] -> pe[n], pe[i|n] -> pe[i]. if (word.length() > 1) { for (int i = 0; i < word.length(); i++) { String head, middle, error, tail; // the word is split into [0..i] [i..i+1] [i+1..n] head = word.substring(0, i); middle = word.substring(i, i + 1); tail = word.substring(i + 1); if (i + 1 < word.length()) { // Common case: deletion of the following character. // p|ei]n -> p[i]n error = word.substring(i, i + 2); } else { // Last case: operate on the previous and next characters. // pe[i|n] -> pe[i] error = word.substring(i - 1, i + 1); } push.call(head + tail, error, middle); } } } private void makeWordTransposition(String word, TriFunction push) { // The Damerau-Levenshtein distance also includes transposition of two // adjacent letters to account for spelling mistakes. // [p|e]n -> [ep]n, p[e|n] -> p[ne]. for (int i = 1; i < word.length(); i++) { String head, middle, transposed, tail; // the word is split into [0..i-1] [i-1..i+1] [i+1..n] head = word.substring(0, i - 1); middle = word.substring(i - 1, i + 1); tail = word.substring(i + 1); transposed = "" + middle.charAt(1) + middle.charAt(0); push.call(head + transposed + tail, middle, transposed); } } private class SentenceRater { private final String[] words; private final double[] word_likelihoods; /** * Words that cannot be modified in further iterations. */ private final boolean[] words_readonly; private double sentence_probability; private WordModification best_modification; public SentenceRater(String[] words, boolean[] word_readonly) { this.words = words.clone(); this.word_likelihoods = new double[words.length]; this.words_readonly = word_readonly; for (int i = 0; i < words.length; i++) { word_likelihoods[i] = getWordLikelihood(i, LM_PROBABILITY_UNMODIFIED); } sentence_probability = combineProbabilities(word_likelihoods); if (DEBUG_SCORE) { debugScore(); } } /** * Calculates the probability of a sentence as a whole. */ private double combineProbabilities(double[] probabilities) { double p = 1; for (double score : probabilities) { if (score == 0) { // Non-existing words are really bad. p *= 1e-99; continue; } p *= score; } return p; } /** * Calculates the probability that the word {@code word} is valid at * position {@code index}. */ public double getWordLikelihood(int index, double channel_probability) { double prior, p, igram_p; String word = words[index]; // a suggested word not in the vocabulary is certainly wrong if (!cr.inVocabulary(word)) { return 0.0; } assert channel_probability > 0.0; String debug_word = null; if (DEBUG_SCORE) { debug_word = ""; } // compute unigram component of language model: P(w) igram_p = cr.getNgramProbability(word, ""); prior = igram_p; if (debug_word != null) { debug_word += " 1p=" + igram_p; } // compute bigrams (P(w|prefix)), etc. String ngram = ""; for (int i = 1; i < NGRAM_N; i++) { // are there actually enough words to compute this metric? if (index - i >= 0) { // increase ngram prefix if (ngram.isEmpty()) { ngram = words[index - i]; } else { ngram = words[index - i] + " " + ngram; } // Obtain n-gram probs and combine using interpolation. igram_p = cr.getNgramProbability(word, ngram); } else { // no metrics found, cannot deduce much information from it igram_p = .5; } prior *= igram_p; if (debug_word != null) { debug_word += " " + (i + 1) + "p=" + igram_p; } } // Finally combine probabilities using the Noisy Channel Model. // P(x|w) is given by language model (noisy channel probability). // Here the prior is a combination of ngram probabilities. // The candidate score is finally computed by P(w) * P(x|w) p = prior * channel_probability; if (debug_word != null) { debugPrint("# " + word + " p=" + p + " chan=" + channel_probability + " prior=" + prior + debug_word); } assert p > 0.0 : "failed probability for " + word; return p; } /** * Tries to modify word at position {@code index} to word and calculates * the likelihood of the word. The best single-word modification is * remembered (the sentence itself will not be modified). */ public void tryWord(int index, String word, double channel_probability) { double score, p; String old_word = words[index]; double[] scores; int index_left, index_right; // Words that have previously been changed (or conseqentive words) // should not be changed again. if (words_readonly[index]) { return; } // Simulate the change of changing this word. words[index] = word; // As changing the word itself can affect the ngram rating of // the context, recalculate the probabilities for those too. index_left = Math.max(0, index - NGRAM_N + 1); index_right = Math.min(words.length, index + NGRAM_N - 1); scores = word_likelihoods.clone(); // calculate probabilities for each word that is possible affected // by this change. for (int i = 0; i < words.length; i++) { if (i < index_left || i > index_right) { // the probability is unchanged, ignore. continue; } if (i == index) { // the word that is being modified score = getWordLikelihood(i, channel_probability); } else { // the word around the modified word score = getWordLikelihood(i, LM_PROBABILITY_UNMODIFIED); } scores[i] = score; } // restore word words[index] = old_word; // group the effects of this modifications for tracking. WordModification effect = new WordModification(index, word, scores); // if there was a previous best modification, the proposed // modification must be better than that. Otherwise, it must be // better than the original sentence. if ((best_modification != null && effect.probability > best_modification.probability) || (best_modification == null && effect.probability > sentence_probability)) { debugPrint("Found better word! " + word + " p=" + effect.probability); best_modification = effect; } } /** * Returns the sentence, possibly changed. */ public String[] getBestSentence() { String[] new_words = words.clone(); assert best_modification == null : "Call saveSuggestion() first"; return new_words; } /** * Returns the score of the current accepted sentence. */ public double getBestScore() { assert best_modification == null : "Call saveSuggestion() first"; return sentence_probability; } /** * Returns true if it is likely that a word in the sentence can be * corrected. */ public boolean hasBetterSuggestion() { return best_modification != null; } private void debugScore() { debugScore(-1, null, 0, 0); } private void debugScore(int index, String old_word, double old_score, double old_evaluation) { System.err.println(); if (index >= 0) { System.err.println("Word: " + old_word + " -> " + words[index]); System.err.println("Word score : " + old_score + " -> " + word_likelihoods[index] + " (" + (word_likelihoods[index] - old_score) + ")"); System.err.println("Phrase evaluation: " + old_evaluation + " -> " + sentence_probability + " (" + (sentence_probability - old_evaluation) + ")"); } else { System.err.println("Phrase evaluation: " + sentence_probability); } for (int i = 0; i < words.length; i++) { System.err.println(String.format("%28s %s", words[i], word_likelihoods[i])); } System.err.println(); } /** * Save the best suggestion so far, ensuring that future suggestions * won't overwrite this one. */ public void saveSuggestion() { int index = best_modification.index; String word = best_modification.word; double[] scores = best_modification.scores; // for debugging String old_word = words[index]; double old_score = word_likelihoods[index]; double old_evaluation = sentence_probability; // save the word and the affected scores words[index] = word; System.arraycopy(word_likelihoods, 0, scores, 0, words.length); sentence_probability = best_modification.probability; if (DEBUG_SCORE) { debugScore(index, old_word, old_score, old_evaluation); } // if this was the best word, do not change it now. The context // should also not change. if (index > 0) { words_readonly[index - 1] = true; } words_readonly[index] = true; if (index + 1 < words.length) { words_readonly[index + 1] = true; } // change was applied, forget suggestion. best_modification = null; } private class WordModification { private final int index; private final String word; private final double[] scores; private final double probability; public WordModification(int index, String word, double[] scores) { this.index = index; this.word = word; this.scores = scores; this.probability = combineProbabilities(scores); } } } }