From 4615358ae4f3aab0e18bb0b290e7c0caaa7441ea Mon Sep 17 00:00:00 2001
From: Peter Wu
Date: Thu, 2 Apr 2015 01:52:55 +0200
Subject: Fix probability calculations

Replace monkey patching by an actual implementation that correctly
combines channel probability (language model), prior and interpolation.
---
 spellchecker/src/CorpusReader.java   | 21 +++++--------
 spellchecker/src/SpellCorrector.java | 60 ++++++++++++++++++++++--------------
 2 files changed, 45 insertions(+), 36 deletions(-)

diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index f9df1ca..8281210 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -115,19 +115,14 @@ public class CorpusReader {
 
         double smoothedCount = getNGramCount(NGram);
 
-        int n_words = NGram.split(" ").length + 1;
-        switch (n_words) {
-            case 1: // unigram
-                smoothedCount += 1.0;
-                break;
-            case 2: // bigram
-                smoothedCount += 2.0;
-                break;
-            case 3: // trigram
-                smoothedCount += 4.0;
-                break;
-            default:
-                throw new AssertionError("Unknown n-gram with n=" + n_words);
+        // The caller invokes func(bigram) / func(unigram) and expects a
+        // probability as result.
+        if (NGram.indexOf(' ') != -1) {
+            // bigram, must be the nominator
+            smoothedCount += 1;
+        } else {
+            // unigram, must be the denominator
+            smoothedCount += 1;
         }
 
         return smoothedCount;
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 64a35e4..ab41c3d 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -9,6 +9,16 @@ public class SpellCorrector {
 
     final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray();
 
+    /**
+     * Lambda values for interpolation of n-gram probabilities. The first value
+     * is for unigrams, the second for bigrams, etc.
+     */
+    private final static double[] LAMBDAS = new double[]{.5, .5};
+    /**
+     * The language model probability for uncorrected words.
+     */
+    private final static double LM_PROBABILITY_UNMODIFIED = .95;
+
     public SpellCorrector(CorpusReader cr, ConfusionMatrixReader cmr) {
         this.cr = cr;
         this.cmr = cmr;
@@ -74,6 +84,7 @@ public class SpellCorrector {
             // add-one smoothing
             p_channel = (correctionCount + 1) / (errorCount + 1);
 
+            // TODO: take the max instead of addition?
             // Sum the probabilities as independent modifications can result in
             // the same word ("acess" -> "access" by "a|ac", "e|ce").
             double p = candidates.getOrDefault(word2, 0.0);
@@ -194,7 +205,8 @@ public class SpellCorrector {
         this.word_likelihoods = new double[words.length];
         this.words_readonly = new boolean[words.length];
         for (int i = 0; i < words.length; i++) {
-            word_likelihoods[i] = getWordLikelihood(i, words[i], 0.5);
+            word_likelihoods[i] = getWordLikelihood(i, words[i],
+                    LM_PROBABILITY_UNMODIFIED);
         }
     }
 
@@ -204,37 +216,38 @@ public class SpellCorrector {
      */
     public double getWordLikelihood(int index, String word,
             double channel_probability) {
-        String old_word = words[index];
-        double likelihood;
+        String prev_word, ngram;
+        double prior, score, p;
         // a suggested word not in the vocabulary is certainly wrong,
         // changed (or consequentive) words should also not be changed.
         if (!cr.inVocabulary(word) || words_readonly[index]) {
             return 0.0;
         }
 
-        // Find prior
-        likelihood = (cr.getNGramCount(words[index]) + .5) / cr.getUnigramCount();
-        // Add probability of bi-grams.
-        if (index > 0) {
-            String str;
-            str = words[index - 1] + " " + words[index];
-            likelihood *= (cr.getNGramCount(str) + .5) / cr.getNGramCount(words[index - 1]);
-        }
+        assert channel_probability > 0.0;
 
-        if (words[index].equals(word)) {
-            // TODO: tune and magic number
-            likelihood *= .80;
-        } else {
-            likelihood *= channel_probability;
-        }
+        // P(x|w) is given by language model (noisy channel probability).
+        // Find prior P(w) = (freq(w) + .5) / N.
+        // Then compute candidate score by P(w) * P(x|w)
+        prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount();
+        score = prior * channel_probability;
 
-        // old word is unknown, let's make sure to change it first.
-        if (!cr.inVocabulary(old_word)) {
-            likelihood += 10;
+        // Now obtain n-gram probabilities. Use interpolation to combine
+        // unigrams and bigrams.
+        p = LAMBDAS[0] * cr.getSmoothedCount(word);
+
+        // Add probability of bi-grams.
+        // For words u and w, P(w|u) = P(u, w) / P(u).
+        if (index > 0) {
+            prev_word = words[index - 1];
+            ngram = prev_word + " " + word;
+            p += LAMBDAS[1] * cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
         }
 
-        assert likelihood > 0.0 : "failed probability for " + word;
-        return likelihood;
+        // Combine the candidate score with the n-gram probabilities.
+        p *= score;
+        assert p > 0.0 : "failed probability for " + word;
+        return p;
     }
 
     /**
@@ -290,7 +303,8 @@ public class SpellCorrector {
         }
         best_modification = null;
         best_likelihood = Double.MIN_VALUE;
-        word_likelihoods[index] = getWordLikelihood(index, word, 0.5);
+        word_likelihoods[index] = getWordLikelihood(index, word,
+                LM_PROBABILITY_UNMODIFIED);
     }
 
     private class WordModification {
--
cgit v1.2.1
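
For reference, a minimal standalone sketch of the scoring that the patched
getWordLikelihood performs: prior times channel probability, multiplied by an
interpolated unigram/bigram term. ToyCorpus and its counts are hypothetical
stand-ins for CorpusReader, included only so the example runs; its
getSmoothedCount mirrors the patched behaviour of returning the raw count plus
one, and the vocabulary and read-only checks are omitted.

import java.util.HashMap;
import java.util.Map;

// Hypothetical stand-in for CorpusReader, present only to make the sketch runnable.
class ToyCorpus {
    private final Map<String, Integer> counts = new HashMap<>();
    private int unigramTotal = 0;

    void add(String ngram, int count) {
        counts.put(ngram, count);
        if (ngram.indexOf(' ') == -1) {
            unigramTotal += count;
        }
    }

    int getNGramCount(String ngram) {
        return counts.getOrDefault(ngram, 0);
    }

    int getUnigramCount() {
        return unigramTotal;
    }

    // Mirrors the patched CorpusReader.getSmoothedCount: raw count plus one.
    double getSmoothedCount(String ngram) {
        return getNGramCount(ngram) + 1;
    }
}

public class ScoreSketch {
    // Same interpolation weights as LAMBDAS in the patch: unigram first, then bigram.
    static final double[] LAMBDAS = {.5, .5};

    // Candidate score as computed by the patched getWordLikelihood.
    static double getWordLikelihood(ToyCorpus cr, String prevWord, String word,
                                    double channelProbability) {
        // Prior P(w) = (freq(w) + .5) / N, combined with the channel model P(x|w).
        double prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount();
        double score = prior * channelProbability;

        // Interpolated n-gram term: lambda_1 times the smoothed unigram count
        // plus lambda_2 times P(w|prev) estimated from smoothed counts.
        double p = LAMBDAS[0] * cr.getSmoothedCount(word);
        if (prevWord != null) {
            String ngram = prevWord + " " + word;
            p += LAMBDAS[1] * cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prevWord);
        }
        return p * score;
    }

    public static void main(String[] args) {
        ToyCorpus cr = new ToyCorpus();
        // Hypothetical counts, chosen only to exercise the formula.
        cr.add("this", 50);
        cr.add("assignment", 5);
        cr.add("this assignment", 3);

        // Candidate "assignment" following "this", with some channel probability.
        System.out.println(getWordLikelihood(cr, "this", "assignment", 0.01));
    }
}

With LAMBDAS = {.5, .5} the interpolation weights sum to one, and
LM_PROBABILITY_UNMODIFIED = .95 takes over the role of the old hard-coded .80
factor for words that are left unchanged.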