From c6b64fa4c9b363379cbdc470cad782412e2db398 Mon Sep 17 00:00:00 2001
From: Peter Wu <peter@lekensteyn.nl>
Date: Sat, 4 Apr 2015 12:53:45 +0200
Subject: Lambda interpolation works badly

It favors large unigram probabilities too much.
---
 spellchecker/src/SpellCorrector.java | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/spellchecker/src/SpellCorrector.java b/spellchecker/src/SpellCorrector.java
index 0bc5b41..fbbfbef 100644
--- a/spellchecker/src/SpellCorrector.java
+++ b/spellchecker/src/SpellCorrector.java
@@ -16,12 +16,6 @@ public class SpellCorrector {
      */
     private final static int NGRAM_N = 2;
 
-    /**
-     * Lambda values for interpolation of n-gram probabilities. The first value
-     * is for unigrams, the second for bigrams, etc.
-     */
-    private final static double[] LAMBDAS = new double[]{.25, .75};
-
     /**
      * The language model probability for uncorrected words.
      */
@@ -321,7 +315,7 @@ public class SpellCorrector {
 
             // compute unigram component of language model: P(w)
             igram_p = cr.getNgramProbability(word, "");
-            prior = LAMBDAS[0] * igram_p;
+            prior = igram_p;
             if (debug_word != null) {
                 debug_word += " 1p=" + igram_p;
             }
@@ -344,7 +338,7 @@ public class SpellCorrector {
                     // no metrics found, cannot deduce much information from it
                     igram_p = .5;
                 }
-                prior += LAMBDAS[i] * igram_p;
+                prior *= igram_p;
                 if (debug_word != null) {
                     debug_word += " " + (i + 1) + "p=" + igram_p;
                 }
@@ -352,9 +346,7 @@ public class SpellCorrector {
 
             // Finally combine probabilities using the Noisy Channel Model.
             // P(x|w) is given by language model (noisy channel probability).
-            // The prior here is different from Kernighans article. Instead of
-            // P(w) = (freq(w) + .5) / N (N is number of words), we use an
-            // interpolation of ngram probabilities.
+            // Here the prior is the product of the n-gram probabilities.
             // The candidate score is finally computed by P(w) * P(x|w)
             p = prior * channel_probability;
 
-- 
cgit v1.2.1