From 4615358ae4f3aab0e18bb0b290e7c0caaa7441ea Mon Sep 17 00:00:00 2001
From: Peter Wu
Date: Thu, 2 Apr 2015 01:52:55 +0200
Subject: Fix probability calculations

Replace monkey patching by an actual implementation that correctly
combines channel probability (language model), prior and interpolation.
---
 spellchecker/src/CorpusReader.java   | 21 +++++--------
 spellchecker/src/SpellCorrector.java | 60 ++++++++++++++++++++++--------------
 2 files changed, 45 insertions(+), 36 deletions(-)

diff --git a/spellchecker/src/CorpusReader.java b/spellchecker/src/CorpusReader.java
index f9df1ca..8281210 100644
--- a/spellchecker/src/CorpusReader.java
+++ b/spellchecker/src/CorpusReader.java
@@ -115,19 +115,14 @@ public class CorpusReader {
 
         double smoothedCount = getNGramCount(NGram);
 
-        int n_words = NGram.split(" ").length + 1;
-        switch (n_words) {
-            case 1: // unigram
-                smoothedCount += 1.0;
-                break;
-            case 2: // bigram
-                smoothedCount += 2.0;
-                break;
-            case 3: // trigram
-                smoothedCount += 4.0;
-                break;
-            default:
-                throw new AssertionError("Unknown n-gram with n=" + n_words);
+        // The caller invokes func(bigram) / func(unigram) and expects a
+        // probability as result.
+        if (NGram.indexOf(' ') != -1) {
+            // bigram, must be the nominator
+            smoothedCount += 1;
+        } else {
+            // unigram, must be the denominator
+            smoothedCount += 1;
         }
 
         return smoothedCount;
diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 64a35e4..ab41c3d 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -9,6 +9,16 @@ public class SpellCorrector {
 
     final static char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz'".toCharArray();
 
+    /**
+     * Lambda values for interpolation of n-gram probabilities. The first value
+     * is for unigrams, the second for bigrams, etc.
+     */
+    private final static double[] LAMBDAS = new double[]{.5, .5};
+    /**
+     * The language model probability for uncorrected words.
+     */
+    private final static double LM_PROBABILITY_UNMODIFIED = .95;
+
     public SpellCorrector(CorpusReader cr, ConfusionMatrixReader cmr) {
         this.cr = cr;
         this.cmr = cmr;
@@ -74,6 +84,7 @@ public class SpellCorrector {
             // add-one smoothing
             p_channel = (correctionCount + 1) / (errorCount + 1);
 
+            // TODO: take the max instead of addition?
             // Sum the probabilities as independent modifications can result in
             // the same word ("acess" -> "access" by "a|ac", "e|ce").
             double p = candidates.getOrDefault(word2, 0.0);
@@ -194,7 +205,8 @@ public class SpellCorrector {
         this.word_likelihoods = new double[words.length];
         this.words_readonly = new boolean[words.length];
         for (int i = 0; i < words.length; i++) {
-            word_likelihoods[i] = getWordLikelihood(i, words[i], 0.5);
+            word_likelihoods[i] = getWordLikelihood(i, words[i],
+                    LM_PROBABILITY_UNMODIFIED);
         }
     }
 
@@ -204,37 +216,38 @@ public class SpellCorrector {
      */
     public double getWordLikelihood(int index, String word,
             double channel_probability) {
-        String old_word = words[index];
-        double likelihood;
+        String prev_word, ngram;
+        double prior, score, p;
         // a suggested word not in the vocabulary is certainly wrong,
         // changed (or consequentive) words should also not be changed.
         if (!cr.inVocabulary(word) || words_readonly[index]) {
             return 0.0;
         }
 
-        // Find prior
-        likelihood = (cr.getNGramCount(words[index]) + .5) / cr.getUnigramCount();
-        // Add probability of bi-grams.
-        if (index > 0) {
-            String str;
-            str = words[index - 1] + " " + words[index];
-            likelihood *= (cr.getNGramCount(str) + .5) / cr.getNGramCount(words[index - 1]);
-        }
+        assert channel_probability > 0.0;
 
-        if (words[index].equals(word)) {
-            // TODO: tune and magic number
-            likelihood *= .80;
-        } else {
-            likelihood *= channel_probability;
-        }
+        // P(x|w) is given by language model (noisy channel probability).
+        // Find prior P(w) = (freq(w) + .5) / N.
+        // Then compute candidate score by P(w) * P(x|w)
+        prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount();
+        score = prior * channel_probability;
 
-        // old word is unknown, let's make sure to change it first.
-        if (!cr.inVocabulary(old_word)) {
-            likelihood += 10;
+        // Now obtain n-gram probabilities. Use interpolation to combine
+        // unigrams and bigrams.
+        p = LAMBDAS[0] * cr.getSmoothedCount(word);
+
+        // Add probability of bi-grams.
+        // For words u and w, P(w|u) = P(u, w) / P(u).
+        if (index > 0) {
+            prev_word = words[index - 1];
+            ngram = prev_word + " " + word;
+            p += LAMBDAS[1] * cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prev_word);
         }
 
-        assert likelihood > 0.0 : "failed probability for " + word;
-        return likelihood;
+        // Combine the candidate score with the n-gram probabilities.
+        p *= score;
+        assert p > 0.0 : "failed probability for " + word;
+        return p;
     }
 
     /**
@@ -290,7 +303,8 @@ public class SpellCorrector {
         }
         best_modification = null;
         best_likelihood = Double.MIN_VALUE;
-        word_likelihoods[index] = getWordLikelihood(index, word, 0.5);
+        word_likelihoods[index] = getWordLikelihood(index, word,
+                LM_PROBABILITY_UNMODIFIED);
     }
 
     private class WordModification {
--
cgit v1.2.1
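
For reference, a minimal standalone sketch of the scoring that the patched
getWordLikelihood performs: prior times channel probability, multiplied by an
interpolated unigram/bigram term. ToyCorpus and its counts are hypothetical
stand-ins for CorpusReader, included only so the example runs; its
getSmoothedCount mirrors the patched behaviour of returning the raw count plus
one, and the vocabulary and read-only checks are omitted.

import java.util.HashMap;
import java.util.Map;

// Hypothetical stand-in for CorpusReader, present only to make the sketch runnable.
class ToyCorpus {
    private final Map<String, Integer> counts = new HashMap<>();
    private int unigramTotal = 0;

    void add(String ngram, int count) {
        counts.put(ngram, count);
        if (ngram.indexOf(' ') == -1) {
            unigramTotal += count;
        }
    }

    int getNGramCount(String ngram) {
        return counts.getOrDefault(ngram, 0);
    }

    int getUnigramCount() {
        return unigramTotal;
    }

    // Mirrors the patched CorpusReader.getSmoothedCount: raw count plus one.
    double getSmoothedCount(String ngram) {
        return getNGramCount(ngram) + 1;
    }
}

public class ScoreSketch {
    // Same interpolation weights as LAMBDAS in the patch: unigram first, then bigram.
    static final double[] LAMBDAS = {.5, .5};

    // Candidate score as computed by the patched getWordLikelihood.
    static double getWordLikelihood(ToyCorpus cr, String prevWord, String word,
                                    double channelProbability) {
        // Prior P(w) = (freq(w) + .5) / N, combined with the channel model P(x|w).
        double prior = (cr.getNGramCount(word) + .5) / cr.getUnigramCount();
        double score = prior * channelProbability;

        // Interpolated n-gram term: lambda_1 times the smoothed unigram count
        // plus lambda_2 times P(w|prev) estimated from smoothed counts.
        double p = LAMBDAS[0] * cr.getSmoothedCount(word);
        if (prevWord != null) {
            String ngram = prevWord + " " + word;
            p += LAMBDAS[1] * cr.getSmoothedCount(ngram) / cr.getSmoothedCount(prevWord);
        }
        return p * score;
    }

    public static void main(String[] args) {
        ToyCorpus cr = new ToyCorpus();
        // Hypothetical counts, chosen only to exercise the formula.
        cr.add("this", 50);
        cr.add("assignment", 5);
        cr.add("this assignment", 3);

        // Candidate "assignment" following "this", with some channel probability.
        System.out.println(getWordLikelihood(cr, "this", "assignment", 0.01));
    }
}

With LAMBDAS = {.5, .5} the interpolation weights sum to one, and
LM_PROBABILITY_UNMODIFIED = .95 takes over the role of the old hard-coded .80
factor for words that are left unchanged.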