From ce91b395fd0139f7e9faba09f5294e6abfaec591 Mon Sep 17 00:00:00 2001 From: Peter Wu Date: Wed, 1 Apr 2015 09:22:16 +0200 Subject: SpellChecker: FUBARED This is the submitted version, it contains random attempts and a fix (actually not that sure that this is the submitted version...). The Double.MIN_VALUE fix is probably missing. --- spellchecker/src/SpellCorrector.java | 65 +++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java index 179206f..64a35e4 100644 --- a/spellchecker/src/SpellCorrector.java +++ b/spellchecker/src/SpellCorrector.java @@ -71,14 +71,8 @@ public class SpellCorrector { correctionCount = (double) cmr.getConfusionCount(error, correct); errorCount = cmr.getErrorsCount(error); - // is this a known correction? - if (errorCount == 0.0) { - // no, - p_channel = 0.1; - } else { - // yes, - p_channel = correctionCount / errorCount; - } + // add-one smoothing + p_channel = (correctionCount + 1) / (errorCount + 1); // Sum the probabilities as independent modifications can result in // the same word ("acess" -> "access" by "a|ac", "e|ce"). @@ -140,7 +134,7 @@ public class SpellCorrector { private void makeWordDeletion(String word, TriFunction push) { - // Generare words by deletion of a character. Requires at least two + // Generate words by deletion of a character. Requires at least two // characters in the word (an empty string is invalid). // |pe]in -> [e]in, p|ei]n -> p[i]n, pe|in] -> pe[n], pe[i|n] -> pe[i]. if (word.length() > 1) { @@ -190,7 +184,7 @@ public class SpellCorrector { /** * Words that cannot be modified in further iterations. */ - private final boolean[] words_locked; + private final boolean[] words_readonly; private double best_likelihood = Double.MIN_VALUE; private WordModification best_modification; @@ -198,9 +192,9 @@ public class SpellCorrector { public SentenceRater(String[] words) { this.words = words.clone(); this.word_likelihoods = new double[words.length]; - this.words_locked = new boolean[words.length]; + this.words_readonly = new boolean[words.length]; for (int i = 0; i < words.length; i++) { - word_likelihoods[i] = getWordProbability(i, words[i], .5); + word_likelihoods[i] = getWordLikelihood(i, words[i], 0.5); } } @@ -208,32 +202,39 @@ public class SpellCorrector { * Calculates the probability that the word {@code word} is valid at * position {@code index}. */ - public double getWordProbability(int index, String word, + public double getWordLikelihood(int index, String word, double channel_probability) { - double probability; - // a word not in the vocabulary is certainly wrong, + String old_word = words[index]; + double likelihood; + // a suggested word not in the vocabulary is certainly wrong, // changed (or consequentive) words should also not be changed. - if (!cr.inVocabulary(word) || words_locked[index]) { + if (!cr.inVocabulary(word) || words_readonly[index]) { return 0.0; } - // Calculate probability of unigram. - double p = cr.getSmoothedCount(words[index]); - p *= channel_probability; + // Find prior + likelihood = (cr.getNGramCount(words[index]) + .5) / cr.getUnigramCount(); // Add probability of bi-grams. if (index > 0) { - // TODO bigrams + String str; + str = words[index - 1] + " " + words[index]; + likelihood *= (cr.getNGramCount(str) + .5) / cr.getNGramCount(words[index - 1]); } - probability = p; if (words[index].equals(word)) { // TODO: tune and magic number - probability *= .80; + likelihood *= .80; } else { - probability *= channel_probability; + likelihood *= channel_probability; } - // TODO: what if this is 0? - return probability; + + // old word is unknown, let's make sure to change it first. + if (!cr.inVocabulary(old_word)) { + likelihood += 10; + } + + assert likelihood > 0.0 : "failed probability for " + word; + return likelihood; } /** @@ -245,7 +246,7 @@ public class SpellCorrector { double likelihood; // try the modification, calculate the result and restore. - likelihood = getWordProbability(index, word, channel_probability); + likelihood = getWordLikelihood(index, word, channel_probability); if (likelihood > best_likelihood) { best_likelihood = likelihood; @@ -278,16 +279,18 @@ public class SpellCorrector { */ public void saveSuggestion() { int index = best_modification.index; - words[index] = best_modification.word; + String word = best_modification.word; + words[index] = word; if (index > 0) { - words_locked[index - 1] = true; + words_readonly[index - 1] = true; } - words_locked[index] = true; + words_readonly[index] = true; if (index + 1 < words.length) { - words_locked[index + 1] = true; + words_readonly[index + 1] = true; } best_modification = null; - // TODO: save likelihood + best_likelihood = Double.MIN_VALUE; + word_likelihoods[index] = getWordLikelihood(index, word, 0.5); } private class WordModification { -- cgit v1.2.1