From 864afee73644051b5d4e1b21760f92f5659010da Mon Sep 17 00:00:00 2001
From: Peter Wu
Date: Sat, 4 Apr 2015 10:53:00 +0200
Subject: Do not count unigrams twice

---
 spellchecker/src/SpellCorrector.java | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index a435f74..2e03f26 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -274,7 +274,7 @@ public class SpellCorrector {
      * position {@code index}.
      */
     public double getWordLikelihood(int index, double channel_probability) {
-        double prior, score, p, igram_p;
+        double prior, p, igram_p;
         String word = words[index];
 
         // a suggested word not in the vocabulary is certainly wrong
@@ -291,23 +291,14 @@ public class SpellCorrector {
             debug_word = "";
         }
 
-        // P(x|w) is given by language model (noisy channel probability).
-        // Find prior P(w) = (freq(w) + .5) / N (N is number of words).
-        // Then compute candidate score by P(w) * P(x|w)
-        // Note: Kernighan uses + .5 here to compensate for unknown words,
-        // but that assigns too much value to the score so do not do that.
-        // (all words are in the vocubalary anyway).
-        prior = (cr.getNGramCount(word) + 0.0) / cr.getWordCount();
-        score = prior * channel_probability;
-
-        // compute unigrams
+        // compute unigram component of language model: P(w)
         igram_p = cr.getNgramProbability(word, "");
-        p = LAMBDAS[0] * igram_p;
+        prior = LAMBDAS[0] * igram_p;
         if (debug_word != null) {
             debug_word += " 1p=" + igram_p;
         }
 
-        // compute bigrams, etc.
+        // compute bigrams (P(w|prefix)), etc.
         String ngram = "";
         for (int i = 1; i < NGRAM_N; i++) {
             // are there actually enough words to compute this metric?
@@ -325,18 +316,23 @@ public class SpellCorrector {
                 // no metrics found, cannot deduce much information from it
                 igram_p = .5;
             }
-            p += LAMBDAS[i] * igram_p;
+            prior += LAMBDAS[i] * igram_p;
             if (debug_word != null) {
                 debug_word += " " + (i + 1) + "p=" + igram_p;
             }
         }
 
-        // finally combine the score with ngram probabilities
-        //p = .1 * score + .9 * p;
-        p *= score;
+        // Finally combine probabilities using the Noisy Channel Model.
+        // P(x|w) is the channel probability, given by the error model.
+        // The prior here is different from Kernighan's article. Instead of
+        // P(w) = (freq(w) + .5) / N (N is number of words), we use an
+        // interpolation of ngram probabilities.
+        // The candidate score is finally computed by P(w) * P(x|w).
+        p = prior * channel_probability;
+
         if (debug_word != null) {
             System.err.println("# " + word + " p=" + p
-                    + " score=" + score + " chan=" + channel_probability
+                    + " chan=" + channel_probability + " prior=" + prior
                     + debug_word);
         }
         assert p > 0.0 : "failed probability for " + word;
--
cgit v1.2.1
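
To make the new scoring concrete, below is a small self-contained Java sketch of the same idea: build the prior as an interpolation of n-gram probabilities (counting the unigram only once) and multiply it by the channel probability P(x|w). The class name, the ngramProbability helper, the toy probability values, and the lambda weights are assumptions made for illustration only; just the LAMBDAS-weighted interpolation and the final P(w) * P(x|w) product correspond to the patched code.

    import java.util.HashMap;
    import java.util.Map;

    public class NoisyChannelSketch {
        // Interpolation weights for the unigram and bigram terms of the prior
        // (the patch calls these LAMBDAS); the values are illustrative only.
        private static final double[] LAMBDAS = {0.3, 0.7};

        // Toy stand-in for a corpus reader's n-gram lookup: returns an assumed
        // probability for the (prefix, word) pair, or a neutral default.
        private static double ngramProbability(Map<String, Double> model,
                                               String word, String prefix) {
            Double p = model.get(prefix + "|" + word);
            return p != null ? p : 0.5;
        }

        public static void main(String[] args) {
            Map<String, Double> model = new HashMap<>();
            model.put("|house", 0.002);    // unigram P(house), empty prefix
            model.put("the|house", 0.04);  // bigram  P(house | the)

            double channelProbability = 0.05;  // P(x|w) from the error model

            // Prior: interpolated n-gram probabilities; the unigram term is
            // counted exactly once, which is the point of this patch.
            double prior = LAMBDAS[0] * ngramProbability(model, "house", "")
                         + LAMBDAS[1] * ngramProbability(model, "house", "the");

            // Candidate score: P(w) * P(x|w).
            double p = prior * channelProbability;
            System.out.printf("prior=%.6f p=%.6f%n", prior, p);
        }
    }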